1#! /usr/bin/env python
2
3"""
4Module difflib -- helpers for computing deltas between objects.
5
6Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
7    Use SequenceMatcher to return list of the best "good enough" matches.
8
9Function context_diff(a, b):
10    For two lists of strings, return a delta in context diff format.
11
12Function ndiff(a, b):
13    Return a delta: the difference between `a` and `b` (lists of strings).
14
15Function restore(delta, which):
16    Return one of the two sequences that generated an ndiff delta.
17
18Function unified_diff(a, b):
19    For two lists of strings, return a delta in unified diff format.
20
21Class SequenceMatcher:
22    A flexible class for comparing pairs of sequences of any type.
23
24Class Differ:
25    For producing human-readable deltas from sequences of lines of text.
26
27Class HtmlDiff:
28    For producing HTML side by side comparison with change highlights.
29"""
30
31__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
32           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
33           'unified_diff', 'HtmlDiff', 'Match']
34
35import heapq
36from collections import namedtuple as _namedtuple
37from functools import reduce
38
39Match = _namedtuple('Match', 'a b size')
40
41def _calculate_ratio(matches, length):
42    if length:
43        return 2.0 * matches / length
44    return 1.0
45
class SequenceMatcher:

    """
    SequenceMatcher is a flexible class for comparing pairs of sequences of
    any type, so long as the sequence elements are hashable.  The basic
    algorithm predates, and is a little fancier than, an algorithm
    published in the late 1980's by Ratcliff and Obershelp under the
    hyperbolic name "gestalt pattern matching".  The basic idea is to find
    the longest contiguous matching subsequence that contains no "junk"
    elements (R-O doesn't address junk).  The same idea is then applied
    recursively to the pieces of the sequences to the left and to the right
    of the matching subsequence.  This does not yield minimal edit
    sequences, but does tend to yield matches that "look right" to people.

    SequenceMatcher tries to compute a "human-friendly diff" between two
    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
    longest *contiguous* & junk-free matching subsequence.  That's what
    catches peoples' eyes.  The Windows(tm) windiff has another interesting
    notion, pairing up elements that appear uniquely in each sequence.
    That, and the method here, appear to yield more intuitive difference
    reports than does diff.  This method appears to be the least vulnerable
    to synching up on blocks of "junk lines", though (like blank lines in
    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
    because this is the only method of the 3 that has a *concept* of
    "junk" <wink>.

    Example, comparing two strings, and considering blanks to be "junk":

    >>> s = SequenceMatcher(lambda x: x == " ",
    ...                     "private Thread currentThread;",
    ...                     "private volatile Thread currentThread;")
    >>>

    .ratio() returns a float in [0, 1], measuring the "similarity" of the
    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
    sequences are close matches:

    >>> print round(s.ratio(), 3)
    0.866
    >>>

    If you're only interested in where the sequences match,
    .get_matching_blocks() is handy:

    >>> for block in s.get_matching_blocks():
    ...     print "a[%d] and b[%d] match for %d elements" % block
    a[0] and b[0] match for 8 elements
    a[8] and b[17] match for 21 elements
    a[29] and b[38] match for 0 elements

    Note that the last tuple returned by .get_matching_blocks() is always a
    dummy, (len(a), len(b), 0), and this is the only case in which the last
    tuple element (number of elements matched) is 0.

    If you want to know how to change the first sequence into the second,
    use .get_opcodes():

    >>> for opcode in s.get_opcodes():
    ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
     equal a[0:8] b[0:8]
    insert a[8:8] b[8:17]
     equal a[8:29] b[17:38]

    See the Differ class for a fancy human-friendly file differencer, which
    uses SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    See also function get_close_matches() in this module, which shows how
    simple code building on SequenceMatcher can be used to do useful work.

    Timing:  Basic R-O is cubic time worst case and quadratic time expected
    case.  SequenceMatcher is quadratic time for the worst case and has
    expected-case behavior dependent in a complicated way on how many
    elements the sequences have in common; best case time is linear.

    Methods:

    __init__(isjunk=None, a='', b='')
        Construct a SequenceMatcher.

    set_seqs(a, b)
        Set the two sequences to be compared.

    set_seq1(a)
        Set the first sequence to be compared.

    set_seq2(b)
        Set the second sequence to be compared.

    find_longest_match(alo, ahi, blo, bhi)
        Find longest matching block in a[alo:ahi] and b[blo:bhi].

    get_matching_blocks()
        Return list of triples describing matching subsequences.

    get_opcodes()
        Return list of 5-tuples describing how to turn a into b.

    ratio()
        Return a measure of the sequences' similarity (float in [0,1]).

    quick_ratio()
        Return an upper bound on .ratio() relatively quickly.

    real_quick_ratio()
        Return an upper bound on ratio() very quickly.
    """

    def __init__(self, isjunk=None, a='', b='', autojunk=True):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
        no elements are considered to be junk.  For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional arg a is the first of two sequences to be compared.  By
        default, an empty string.  The elements of a must be hashable.  See
        also .set_seqs() and .set_seq1().

        Optional arg b is the second of two sequences to be compared.  By
        default, an empty string.  The elements of b must be hashable. See
        also .set_seqs() and .set_seq2().

        Optional arg autojunk should be set to False to disable the
        "automatic junk heuristic" that treats popular elements as junk
        (see module documentation for more information).
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the __contains__ method of a hidden dict.
        #      DOES NOT WORK for x in a!
        # isbpopular
        #      for x in b, isbpopular(x) is true iff b is reasonably long
        #      (at least 200 elements) and x accounts for more than 1 + 1% of
        #      its elements (when autojunk is enabled).
        #      DOES NOT WORK for x in a!

        self.isjunk = isjunk
        self.a = self.b = None
        self.autojunk = autojunk
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # b2j also does not contain entries for "popular" elements, meaning
    # elements that account for more than 1 + 1% of the total elements, and
    # when the sequence is reasonably large (>= 200 elements); this can
    # be viewed as an adaptive notion of semi-junk, and yields an enormous
    # speedup when, e.g., comparing program files with hundreds of
    # instances of "return NULL;" ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        self.b2j = b2j = {}

        for i, elt in enumerate(b):
            indices = b2j.setdefault(elt, [])
            indices.append(i)

        # Purge junk elements
        junk = set()
        isjunk = self.isjunk
        if isjunk:
            for elt in list(b2j.keys()):  # using list() since b2j is modified
                if isjunk(elt):
                    junk.add(elt)
                    del b2j[elt]

        # Purge popular elements that are not junk
        popular = set()
        n = len(b)
        if self.autojunk and n >= 200:
            ntest = n // 100 + 1
            for elt, idxs in list(b2j.items()):
                if len(idxs) > ntest:
                    popular.add(elt)
                    del b2j[elt]

        # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
        # Since the number of *unique* junk elements is probably small, the
        # memory burden of keeping this set alive is likely trivial compared to
        # the size of b2j.
        self.isbjunk = junk.__contains__
        self.isbpopular = popular.__contains__

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=0, b=4, size=5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=1, b=0, size=4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        Match(a=0, b=0, size=0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []
        # NOTE: range (not the py2-only xrange) keeps this module importable
        # on Python 3, consistent with the py3-compatible idioms used
        # elsewhere in this file (e.g. list(b2j.keys()) in __chain_b).
        for i in range(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        return Match(besti, bestj, bestsize)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.  New in Python 2.5, it's also guaranteed that if
        (i, j, n) and (i', j', n') are adjacent triples in the list, and
        the second is not the last triple in the list, then i+n != i' or
        j+n != j'.  IOW, adjacent triples never describe adjacent equal
        blocks.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        la, lb = len(self.a), len(self.b)

        # This is most naturally expressed as a recursive algorithm, but
        # at least one user bumped into extreme use cases that exceeded
        # the recursion limit on their box.  So, now we maintain a list
        # (`queue`) of blocks we still need to look at, and append partial
        # results to `matching_blocks` in a loop; the matches are sorted
        # at the end.
        queue = [(0, la, 0, lb)]
        matching_blocks = []
        while queue:
            alo, ahi, blo, bhi = queue.pop()
            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
            # a[alo:i] vs b[blo:j] unknown
            # a[i:i+k] same as b[j:j+k]
            # a[i+k:ahi] vs b[j+k:bhi] unknown
            if k:   # if k is 0, there was no matching block
                matching_blocks.append(x)
                if alo < i and blo < j:
                    queue.append((alo, i, blo, j))
                if i+k < ahi and j+k < bhi:
                    queue.append((i+k, ahi, j+k, bhi))
        matching_blocks.sort()

        # It's possible that we have adjacent equal blocks in the
        # matching_blocks list now.  Starting with 2.5, this code was added
        # to collapse them.
        i1 = j1 = k1 = 0
        non_adjacent = []
        for i2, j2, k2 in matching_blocks:
            # Is this block adjacent to i1, j1, k1?
            if i1 + k1 == i2 and j1 + k1 == j2:
                # Yes, so collapse them -- this just increases the length of
                # the first block by the length of the second, and the first
                # block so lengthened remains the block to compare against.
                k1 += k2
            else:
                # Not adjacent.  Remember the first block (k1==0 means it's
                # the dummy we started with), and make the second block the
                # new block to compare against.
                if k1:
                    non_adjacent.append((i1, j1, k1))
                i1, j1, k1 = i2, j2, k2
        if k1:
            non_adjacent.append((i1, j1, k1))

        non_adjacent.append( (la, lb, 0) )
        self.matching_blocks = non_adjacent
        # Materialize the map so callers get the documented list (under
        # Python 3 a bare map() would be a one-shot iterator).
        return list(map(Match._make, self.matching_blocks))

    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant:  we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer

    def get_grouped_opcodes(self, n=3):
        """ Isolate change clusters by eliminating ranges with no changes.

        Return a generator of groups with upto n lines of context.
        Each group is in the same format as returned by get_opcodes().

        >>> from pprint import pprint
        >>> a = map(str, range(1,40))
        >>> b = a[:]
        >>> b[8:8] = ['i']     # Make an insertion
        >>> b[20] += 'x'       # Make a replacement
        >>> b[23:28] = []      # Make a deletion
        >>> b[30] += 'y'       # Make another replacement
        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
         [('equal', 16, 19, 17, 20),
          ('replace', 19, 20, 20, 21),
          ('equal', 20, 22, 21, 23),
          ('delete', 22, 27, 23, 23),
          ('equal', 27, 30, 23, 26)],
         [('equal', 31, 34, 27, 30),
          ('replace', 34, 35, 30, 31),
          ('equal', 35, 38, 31, 34)]]
        """

        codes = self.get_opcodes()
        if not codes:
            codes = [("equal", 0, 1, 0, 1)]
        # Fixup leading and trailing groups if they show no changes.
        if codes[0][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[0]
            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
        if codes[-1][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[-1]
            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)

        nn = n + n
        group = []
        for tag, i1, i2, j1, j2 in codes:
            # End the current group and start a new one whenever
            # there is a large range with no changes.
            if tag == 'equal' and i2-i1 > nn:
                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
                yield group
                group = []
                i1, j1 = max(i1, i2-n), max(j1, j2-n)
            group.append((tag, i1, i2, j1, j2))
        if group and not (len(group)==1 and group[0][0] == 'equal'):
            yield group

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.__contains__, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches += 1
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return _calculate_ratio(min(la, lb), la + lb)
702
def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword as _keyword
    >>> get_close_matches("wheel", _keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", _keyword.kwlist)
    []
    >>> get_close_matches("accept", _keyword.kwlist)
    ['except']
    """

    if n <= 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s = SequenceMatcher()
    # word goes in seq2 so its per-element index table is built only once;
    # each possibility is then compared against it via the cheap seq1 slot.
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        # Cheapest upper bound first; only compute the real ratio when
        # both quick estimates clear the cutoff.
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), x))

    # Move the best scorers to head of list
    result = heapq.nlargest(n, result)
    # Strip scores for the best n matches
    return [x for score, x in result]
750
751def _count_leading(line, ch):
752    """
753    Return number of `ch` characters at the start of `line`.
754
755    Example:
756
757    >>> _count_leading('   abc', ' ')
758    3
759    """
760
761    i, n = 0, len(line)
762    while i < n and line[i] == ch:
763        i += 1
764    return i
765
class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(1)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(1)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print ''.join(result),
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; as of Python 2.3, the underlying
          SequenceMatcher class has grown an adaptive notion of "noise" lines
          that's better than any static definition the author has ever been
          able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        # Stored as-is; passed to the line-level and character-level
        # SequenceMatcher instances respectively in compare().
        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writeline()
        method of a file-like object.

        Example:

        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
        ...                                'ore\ntree\nemu\n'.splitlines(1))),
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        # Line-level matching drives everything; only 'replace' runs get
        # the expensive intraline treatment via _fancy_replace.
        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError, 'unknown tag %r' % (tag,)

            for line in g:
                yield line

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range.

        Yields each line x[lo:hi] prefixed by `tag` ('-', '+' or ' ')
        and a single space.
        """
        for i in xrange(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        """Yield delete then insert lines for a replace block with no
        intraline markup, dumping the shorter block first."""
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            for line in g:
                yield line

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print ''.join(results),
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        # (starting it just below cutoff means best_ratio < cutoff is
        # still true if no candidate pair beat 0.74)
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
                    yield line
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
            yield line

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag %r' % (tag,)
            for line in self._qformat(aelt, belt, atags, btags):
                yield line
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
            yield line

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        """Recurse into _fancy_replace when both leftover ranges are
        non-empty; otherwise dump whichever side remains (if any)."""
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        for line in g:
            yield line

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
        >>> for line in results: print repr(line)
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \tabcdefGhijkl\n'
        '? \t ^ ^  ^\n'
        """

        # Can hurt, but will probably help most of the time.
        # Replace leading runs of tabs common to both lines (and unmarked
        # in both tag strings) with tabs in the '?' line so the guide
        # characters stay visually aligned under tab-indented text.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        common = min(common, _count_leading(btags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        yield "- " + aline
        if atags:
            yield "? %s%s\n" % ("\t" * common, atags)

        yield "+ " + bline
        if btags:
            yield "? %s%s\n" % ("\t" * common, btags)
1088
1089# With respect to junk, an earlier version of ndiff simply refused to
1090# *start* a match with a junk element.  The result was cases like this:
1091#     before: private Thread currentThread;
1092#     after:  private volatile Thread currentThread;
1093# If you consider whitespace to be junk, the longest contiguous match
1094# not starting with junk is "e Thread currentThread".  So ndiff reported
1095# that "e volatil" was inserted between the 't' and the 'e' in "private".
1096# While an accurate view, to people that's absurd.  The current version
1097# looks for matching blocks that are entirely junk-free, then extends the
1098# longest one of those as far as possible but only with matching junk.
1099# So now "currentThread" is matched, then extended to suck up the
1100# preceding blank; then "private" is matched, and extended to suck up the
1101# following blank; then "Thread" is matched; and finally ndiff reports
1102# that "volatile " was inserted before "Thread".  The only quibble
1103# remaining is that perhaps it was really the case that " volatile"
1104# was inserted after "private".  I can live with that <wink>.
1105
1106import re
1107
def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    r"""
    Return 1 for ignorable line: iff `line` is blank or contains a single '#'.

    Examples:

    >>> IS_LINE_JUNK('\n')
    True
    >>> IS_LINE_JUNK('  #   \n')
    True
    >>> IS_LINE_JUNK('hello\n')
    False
    """

    # The bound `match` method is pre-compiled once as a default argument;
    # a match object is truthy, a failed match is None.
    return bool(pat(line))
1123
def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return 1 for ignorable character: iff `ch` is a space or tab.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    True
    >>> IS_CHARACTER_JUNK('\t')
    True
    >>> IS_CHARACTER_JUNK('\n')
    False
    >>> IS_CHARACTER_JUNK('x')
    False
    """

    # str.find keeps the same substring semantics as `ch in ws`.
    return ws.find(ch) >= 0
1141
1142
1143########################################################################
1144###  Unified Diff
1145########################################################################
1146
1147def _format_range_unified(start, stop):
1148    'Convert range to the "ed" format'
1149    # Per the diff spec at http://www.unix.org/single_unix_specification/
1150    beginning = start + 1     # lines start numbering with one
1151    length = stop - start
1152    if length == 1:
1153        return '{}'.format(beginning)
1154    if not length:
1155        beginning -= 1        # empty ranges begin at line just before the range
1156    return '{},{}'.format(beginning, length)
1157
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    The modification times are normally expressed in the ISO 8601 format.

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
    ...             lineterm=''):
    ...     print line                  # doctest: +NORMALIZE_WHITESPACE
    --- Original        2005-01-26 23:30:50
    +++ Current         2010-04-02 10:20:52
    @@ -1,4 +1,4 @@
    +zero
    one
    -two
    -three
    +tree
    four
    """

    header_done = False
    for group in SequenceMatcher(None, a, b).get_grouped_opcodes(n):
        # Emit the file header once, lazily, so a no-difference compare
        # produces no output at all.
        if not header_done:
            header_done = True
            if fromfiledate:
                fromdate = '\t{}'.format(fromfiledate)
            else:
                fromdate = ''
            if tofiledate:
                todate = '\t{}'.format(tofiledate)
            else:
                todate = ''
            yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
            yield '+++ {}{}{}'.format(tofile, todate, lineterm)

        first, last = group[0], group[-1]
        file1_range = _format_range_unified(first[1], last[2])
        file2_range = _format_range_unified(first[3], last[4])
        yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)

        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
            else:
                # 'replace' emits both sides; 'delete'/'insert' one each.
                if tag != 'insert':
                    for line in a[i1:i2]:
                        yield '-' + line
                if tag != 'delete':
                    for line in b[j1:j2]:
                        yield '+' + line
1224
1225
1226########################################################################
1227###  Context Diff
1228########################################################################
1229
1230def _format_range_context(start, stop):
1231    'Convert range to the "ed" format'
1232    # Per the diff spec at http://www.unix.org/single_unix_specification/
1233    beginning = start + 1     # lines start numbering with one
1234    length = stop - start
1235    if not length:
1236        beginning -= 1        # empty ranges begin at line just before the range
1237    if length <= 1:
1238        return '{}'.format(beginning)
1239    return '{},{}'.format(beginning, beginning + length - 1)
1240
1241# See http://www.unix.org/single_unix_specification/
def context_diff(a, b, fromfile='', tofile='',
                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a context diff.

    Context diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with *** or ---) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The context diff format normally has a header for filenames and
    modification times.  Any or all of these may be specified using
    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    The modification times are normally expressed in the ISO 8601 format.
    If not specified, the strings default to blanks.

    Example:

    >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
    ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')),
    *** Original
    --- Current
    ***************
    *** 1,4 ****
      one
    ! two
    ! three
      four
    --- 1,4 ----
    + zero
      one
    ! tree
      four
    """

    prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
    header_done = False
    for group in SequenceMatcher(None, a, b).get_grouped_opcodes(n):
        # Emit the file header once, lazily, so identical inputs yield
        # nothing at all.
        if not header_done:
            header_done = True
            if fromfiledate:
                fromdate = '\t{}'.format(fromfiledate)
            else:
                fromdate = ''
            if tofiledate:
                todate = '\t{}'.format(tofiledate)
            else:
                todate = ''
            yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
            yield '--- {}{}{}'.format(tofile, todate, lineterm)

        first, last = group[0], group[-1]
        yield '***************' + lineterm

        file1_range = _format_range_context(first[1], last[2])
        yield '*** {} ****{}'.format(file1_range, lineterm)

        # The "from" body is emitted only if something was deleted or
        # changed on that side.
        group_tags = [op[0] for op in group]
        if 'replace' in group_tags or 'delete' in group_tags:
            for tag, i1, i2, _, _ in group:
                if tag != 'insert':
                    for line in a[i1:i2]:
                        yield prefix[tag] + line

        file2_range = _format_range_context(first[3], last[4])
        yield '--- {} ----{}'.format(file2_range, lineterm)

        # Likewise, the "to" body appears only when lines were added or
        # changed on that side.
        if 'replace' in group_tags or 'insert' in group_tags:
            for tag, _, _, j1, j2 in group:
                if tag != 'delete':
                    for line in b[j1:j2]:
                        yield prefix[tag] + line
1315
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
    r"""
    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

    Optional keyword parameters `linejunk` and `charjunk` are for filter
    functions (or None):

    - linejunk: A function that should accept a single string argument, and
      return true iff the string is junk.  The default is None, and is
      recommended; as of Python 2.3, an adaptive notion of "noise" lines is
      used that does a good job on its own.

    - charjunk: A function that should accept a string of length 1. The
      default is module-level function IS_CHARACTER_JUNK, which filters out
      whitespace characters (a blank or tab; note: bad idea to include newline
      in this!).

    Tools/scripts/ndiff.py is a command-line front-end to this function.

    Example:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> print ''.join(diff),
    - one
    ?  ^
    + ore
    ?  ^
    - two
    - three
    ?  -
    + tree
    + emu
    """
    # Thin convenience wrapper: build a Differ with the given junk
    # filters and hand back its comparison generator.
    differ = Differ(linejunk, charjunk)
    return differ.compare(a, b)
1351
1352def _mdiff(fromlines, tolines, context=None, linejunk=None,
1353           charjunk=IS_CHARACTER_JUNK):
1354    r"""Returns generator yielding marked up from/to side by side differences.
1355
1356    Arguments:
1357    fromlines -- list of text lines to compared to tolines
1358    tolines -- list of text lines to be compared to fromlines
1359    context -- number of context lines to display on each side of difference,
1360               if None, all from/to text lines will be generated.
1361    linejunk -- passed on to ndiff (see ndiff documentation)
1362    charjunk -- passed on to ndiff (see ndiff documentation)
1363
1364    This function returns an interator which returns a tuple:
1365    (from line tuple, to line tuple, boolean flag)
1366
1367    from/to line tuple -- (line num, line text)
1368        line num -- integer or None (to indicate a context separation)
1369        line text -- original line text with following markers inserted:
1370            '\0+' -- marks start of added text
1371            '\0-' -- marks start of deleted text
1372            '\0^' -- marks start of changed text
1373            '\1' -- marks end of added/deleted/changed text
1374
1375    boolean flag -- None indicates context separation, True indicates
1376        either "from" or "to" line contains a change, otherwise False.
1377
1378    This function/iterator was originally developed to generate side by side
1379    file difference for making HTML pages (see HtmlDiff class for example
1380    usage).
1381
1382    Note, this function utilizes the ndiff function to generate the side by
1383    side difference markup.  Optional ndiff arguments may be passed to this
1384    function and they in turn will be passed to ndiff.
1385    """
1386    import re
1387
1388    # regular expression for finding intraline change indices
1389    change_re = re.compile('(\++|\-+|\^+)')
1390
1391    # create the difference iterator to generate the differences
1392    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
1393
1394    def _make_line(lines, format_key, side, num_lines=[0,0]):
1395        """Returns line of text with user's change markup and line formatting.
1396
1397        lines -- list of lines from the ndiff generator to produce a line of
1398                 text from.  When producing the line of text to return, the
1399                 lines used are removed from this list.
1400        format_key -- '+' return first line in list with "add" markup around
1401                          the entire line.
1402                      '-' return first line in list with "delete" markup around
1403                          the entire line.
1404                      '?' return first line in list with add/delete/change
1405                          intraline markup (indices obtained from second line)
1406                      None return first line in list with no markup
1407        side -- indice into the num_lines list (0=from,1=to)
1408        num_lines -- from/to current line number.  This is NOT intended to be a
1409                     passed parameter.  It is present as a keyword argument to
1410                     maintain memory of the current line numbers between calls
1411                     of this function.
1412
1413        Note, this function is purposefully not defined at the module scope so
1414        that data it needs from its parent function (within whose context it
1415        is defined) does not need to be of module scope.
1416        """
1417        num_lines[side] += 1
1418        # Handle case where no user markup is to be added, just return line of
1419        # text with user's line format to allow for usage of the line number.
1420        if format_key is None:
1421            return (num_lines[side],lines.pop(0)[2:])
1422        # Handle case of intraline changes
1423        if format_key == '?':
1424            text, markers = lines.pop(0), lines.pop(0)
1425            # find intraline changes (store change type and indices in tuples)
1426            sub_info = []
1427            def record_sub_info(match_object,sub_info=sub_info):
1428                sub_info.append([match_object.group(1)[0],match_object.span()])
1429                return match_object.group(1)
1430            change_re.sub(record_sub_info,markers)
1431            # process each tuple inserting our special marks that won't be
1432            # noticed by an xml/html escaper.
1433            for key,(begin,end) in sub_info[::-1]:
1434                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
1435            text = text[2:]
1436        # Handle case of add/delete entire line
1437        else:
1438            text = lines.pop(0)[2:]
1439            # if line of text is just a newline, insert a space so there is
1440            # something for the user to highlight and see.
1441            if not text:
1442                text = ' '
1443            # insert marks that won't be noticed by an xml/html escaper.
1444            text = '\0' + format_key + text + '\1'
1445        # Return line of text, first allow user's line formatter to do its
1446        # thing (such as adding the line number) then replace the special
1447        # marks with what the user's change markup.
1448        return (num_lines[side],text)
1449
1450    def _line_iterator():
1451        """Yields from/to lines of text with a change indication.
1452
1453        This function is an iterator.  It itself pulls lines from a
1454        differencing iterator, processes them and yields them.  When it can
1455        it yields both a "from" and a "to" line, otherwise it will yield one
1456        or the other.  In addition to yielding the lines of from/to text, a
1457        boolean flag is yielded to indicate if the text line(s) have
1458        differences in them.
1459
1460        Note, this function is purposefully not defined at the module scope so
1461        that data it needs from its parent function (within whose context it
1462        is defined) does not need to be of module scope.
1463        """
1464        lines = []
1465        num_blanks_pending, num_blanks_to_yield = 0, 0
1466        while True:
1467            # Load up next 4 lines so we can look ahead, create strings which
1468            # are a concatenation of the first character of each of the 4 lines
1469            # so we can do some very readable comparisons.
1470            while len(lines) < 4:
1471                try:
1472                    lines.append(diff_lines_iterator.next())
1473                except StopIteration:
1474                    lines.append('X')
1475            s = ''.join([line[0] for line in lines])
1476            if s.startswith('X'):
1477                # When no more lines, pump out any remaining blank lines so the
1478                # corresponding add/delete lines get a matching blank line so
1479                # all line pairs get yielded at the next level.
1480                num_blanks_to_yield = num_blanks_pending
1481            elif s.startswith('-?+?'):
1482                # simple intraline change
1483                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
1484                continue
1485            elif s.startswith('--++'):
1486                # in delete block, add block coming: we do NOT want to get
1487                # caught up on blank lines yet, just process the delete line
1488                num_blanks_pending -= 1
1489                yield _make_line(lines,'-',0), None, True
1490                continue
1491            elif s.startswith(('--?+', '--+', '- ')):
1492                # in delete block and see a intraline change or unchanged line
1493                # coming: yield the delete line and then blanks
1494                from_line,to_line = _make_line(lines,'-',0), None
1495                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
1496            elif s.startswith('-+?'):
1497                # intraline change
1498                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
1499                continue
1500            elif s.startswith('-?+'):
1501                # intraline change
1502                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
1503                continue
1504            elif s.startswith('-'):
1505                # delete FROM line
1506                num_blanks_pending -= 1
1507                yield _make_line(lines,'-',0), None, True
1508                continue
1509            elif s.startswith('+--'):
1510                # in add block, delete block coming: we do NOT want to get
1511                # caught up on blank lines yet, just process the add line
1512                num_blanks_pending += 1
1513                yield None, _make_line(lines,'+',1), True
1514                continue
1515            elif s.startswith(('+ ', '+-')):
1516                # will be leaving an add block: yield blanks then add line
1517                from_line, to_line = None, _make_line(lines,'+',1)
1518                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
1519            elif s.startswith('+'):
1520                # inside an add block, yield the add line
1521                num_blanks_pending += 1
1522                yield None, _make_line(lines,'+',1), True
1523                continue
1524            elif s.startswith(' '):
1525                # unchanged text, yield it to both sides
1526                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
1527                continue
1528            # Catch up on the blank lines so when we yield the next from/to
1529            # pair, they are lined up.
1530            while(num_blanks_to_yield < 0):
1531                num_blanks_to_yield += 1
1532                yield None,('','\n'),True
1533            while(num_blanks_to_yield > 0):
1534                num_blanks_to_yield -= 1
1535                yield ('','\n'),None,True
1536            if s.startswith('X'):
1537                raise StopIteration
1538            else:
1539                yield from_line,to_line,True
1540
1541    def _line_pair_iterator():
1542        """Yields from/to lines of text with a change indication.
1543
1544        This function is an iterator.  It itself pulls lines from the line
1545        iterator.  Its difference from that iterator is that this function
1546        always yields a pair of from/to text lines (with the change
1547        indication).  If necessary it will collect single from/to lines
1548        until it has a matching pair from/to pair to yield.
1549
1550        Note, this function is purposefully not defined at the module scope so
1551        that data it needs from its parent function (within whose context it
1552        is defined) does not need to be of module scope.
1553        """
1554        line_iterator = _line_iterator()
1555        fromlines,tolines=[],[]
1556        while True:
1557            # Collecting lines of text until we have a from/to pair
1558            while (len(fromlines)==0 or len(tolines)==0):
1559                from_line, to_line, found_diff =line_iterator.next()
1560                if from_line is not None:
1561                    fromlines.append((from_line,found_diff))
1562                if to_line is not None:
1563                    tolines.append((to_line,found_diff))
1564            # Once we have a pair, remove them from the collection and yield it
1565            from_line, fromDiff = fromlines.pop(0)
1566            to_line, to_diff = tolines.pop(0)
1567            yield (from_line,to_line,fromDiff or to_diff)
1568
1569    # Handle case where user does not want context differencing, just yield
1570    # them up without doing anything else with them.
1571    line_pair_iterator = _line_pair_iterator()
1572    if context is None:
1573        while True:
1574            yield line_pair_iterator.next()
1575    # Handle case where user wants context differencing.  We must do some
1576    # storage of lines until we know for sure that they are to be yielded.
1577    else:
1578        context += 1
1579        lines_to_write = 0
1580        while True:
1581            # Store lines up until we find a difference, note use of a
1582            # circular queue because we only need to keep around what
1583            # we need for context.
1584            index, contextLines = 0, [None]*(context)
1585            found_diff = False
1586            while(found_diff is False):
1587                from_line, to_line, found_diff = line_pair_iterator.next()
1588                i = index % context
1589                contextLines[i] = (from_line, to_line, found_diff)
1590                index += 1
1591            # Yield lines that we have collected so far, but first yield
1592            # the user's separator.
1593            if index > context:
1594                yield None, None, None
1595                lines_to_write = context
1596            else:
1597                lines_to_write = index
1598                index = 0
1599            while(lines_to_write):
1600                i = index % context
1601                index += 1
1602                yield contextLines[i]
1603                lines_to_write -= 1
1604            # Now yield the context lines after the change
1605            lines_to_write = context-1
1606            while(lines_to_write):
1607                from_line, to_line, found_diff = line_pair_iterator.next()
1608                # If another change within the context, extend the context
1609                if found_diff:
1610                    lines_to_write = context-1
1611                else:
1612                    lines_to_write -= 1
1613                yield from_line, to_line, found_diff
1614
1615
# Skeleton of the complete HTML page produced by HtmlDiff.make_file().
# The %(...)s slots are filled with the CSS rules, the diff table and the
# legend table.
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=ISO-8859-1" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""

# Default CSS: header shading plus the diff_add/diff_chg/diff_sub highlight
# classes emitted by HtmlDiff.make_table() and the diff_next link column.
_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

# Side-by-side diff table.  The six <colgroup>s match the six columns each
# data row has ("next" link, line number, text -- once per side).
# %(prefix)s is the unique anchor prefix from HtmlDiff._make_prefix().
_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

# Static legend table explaining the highlight colors and the (f)/(n)/(t)
# navigation links inserted by HtmlDiff._convert_flags().
_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""
1671
class HtmlDiff(object):
    """For producing HTML side by side comparison with change highlights.

    This class can be used to create an HTML table (or a complete HTML file
    containing the table) showing a side by side, line by line comparison
    of text with inter-line and intra-line change highlights.  The table can
    be generated in either full or contextual difference mode.

    The following methods are provided for HTML generation:

    make_table -- generates HTML for a single side by side table
    make_file -- generates complete HTML file with a single side by side table

    See tools/scripts/diff.py for an example usage of this class.
    """

    _file_template = _file_template
    _styles = _styles
    _table_template = _table_template
    _legend = _legend
    # Class-wide counter used by _make_prefix() so that several tables on
    # the same page get distinct anchor id prefixes.
    _default_prefix = 0

    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
                 charjunk=IS_CHARACTER_JUNK):
        """HtmlDiff instance initializer

        Arguments:
        tabsize -- tab stop spacing, defaults to 8.
        wrapcolumn -- column number where lines are broken and wrapped,
            defaults to None where lines are not wrapped.
        linejunk,charjunk -- keyword arguments passed into ndiff() (used to by
            HtmlDiff() to generate the side by side HTML differences).  See
            ndiff() documentation for argument default values and descriptions.
        """
        self._tabsize = tabsize
        self._wrapcolumn = wrapcolumn
        self._linejunk = linejunk
        self._charjunk = charjunk

    def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
                  numlines=5):
        """Returns HTML file of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        """

        return self._file_template % dict(
            styles = self._styles,
            legend = self._legend,
            table = self.make_table(fromlines,tolines,fromdesc,todesc,
                                    context=context,numlines=numlines))

    def _tab_newline_replace(self,fromlines,tolines):
        """Returns from/to line lists with tabs expanded and newlines removed.

        Instead of tab characters being replaced by the number of spaces
        needed to fill in to the next tab stop, this function will fill
        the space with tab characters.  This is done so that the difference
        algorithms can identify changes in a file when tabs are replaced by
        spaces and vice versa.  At the end of the HTML generation, the tab
        characters will be replaced with a nonbreakable space.
        """
        def expand_tabs(line):
            # hide real spaces (as '\0' placeholders)
            line = line.replace(' ','\0')
            # expand tabs into spaces
            line = line.expandtabs(self._tabsize)
            # replace spaces from expanded tabs back into tab characters
            # (we'll replace them with markup after we do differencing)
            line = line.replace(' ','\t')
            return line.replace('\0',' ').rstrip('\n')
        fromlines = [expand_tabs(line) for line in fromlines]
        tolines = [expand_tabs(line) for line in tolines]
        return fromlines,tolines

    def _split_line(self,data_list,line_num,text):
        """Builds list of text lines by splitting text lines at wrap point

        This function will determine if the input text line needs to be
        wrapped (split) into separate lines.  If so, the first wrap point
        will be determined and the first line appended to the output
        text line list.  This function is used recursively to handle
        the second part of the split line to further split it.
        """
        # if blank line or context separator, just add it to the output list
        if not line_num:
            data_list.append((line_num,text))
            return

        # if line text doesn't need wrapping, just add it to the output list
        # (the count('\0')*3 term discounts the invisible intraline marker
        # characters so only visible width is compared against the limit)
        size = len(text)
        max_len = self._wrapcolumn  # renamed so the builtin max() isn't shadowed
        if (size <= max_len) or ((size -(text.count('\0')*3)) <= max_len):
            data_list.append((line_num,text))
            return

        # scan text looking for the wrap point, keeping track if the wrap
        # point is inside markers
        i = 0
        n = 0
        mark = ''
        while n < max_len and i < size:
            if text[i] == '\0':
                i += 1
                mark = text[i]
                i += 1
            elif text[i] == '\1':
                i += 1
                mark = ''
            else:
                i += 1
                n += 1

        # wrap point is inside text, break it up into separate lines
        line1 = text[:i]
        line2 = text[i:]

        # if wrap point is inside markers, place end marker at end of first
        # line and start marker at beginning of second line because each
        # line will have its own table tag markup around it.
        if mark:
            line1 = line1 + '\1'
            line2 = '\0' + mark + line2

        # tack on first line onto the output list
        data_list.append((line_num,line1))

        # use this routine again to wrap the remaining text
        self._split_line(data_list,'>',line2)

    def _line_wrapper(self,diffs):
        """Returns iterator that splits (wraps) mdiff text lines"""

        # pull from/to data and flags from mdiff iterator
        for fromdata,todata,flag in diffs:
            # check for context separators and pass them through
            if flag is None:
                yield fromdata,todata,flag
                continue
            (fromline,fromtext),(toline,totext) = fromdata,todata
            # for each from/to line split it at the wrap column to form
            # list of text lines.
            fromlist,tolist = [],[]
            self._split_line(fromlist,fromline,fromtext)
            self._split_line(tolist,toline,totext)
            # yield from/to line in pairs inserting blank lines as
            # necessary when one side has more wrapped lines
            while fromlist or tolist:
                if fromlist:
                    fromdata = fromlist.pop(0)
                else:
                    fromdata = ('',' ')
                if tolist:
                    todata = tolist.pop(0)
                else:
                    todata = ('',' ')
                yield fromdata,todata,flag

    def _collect_lines(self,diffs):
        """Collects mdiff output into separate lists

        Before storing the mdiff from/to data into a list, it is converted
        into a single line of text with HTML markup.
        """

        fromlist,tolist,flaglist = [],[],[]
        # pull from/to data and flags from mdiff style iterator
        for fromdata,todata,flag in diffs:
            try:
                # store HTML markup of the lines into the lists
                fromlist.append(self._format_line(0,flag,*fromdata))
                tolist.append(self._format_line(1,flag,*todata))
            except TypeError:
                # exceptions occur for lines where context separators go
                # (their from/to data is None, which cannot be star-unpacked)
                fromlist.append(None)
                tolist.append(None)
            flaglist.append(flag)
        return fromlist,tolist,flaglist

    def _format_line(self,side,flag,linenum,text):
        """Returns HTML markup of "from" / "to" text lines

        side -- 0 or 1 indicating "from" or "to" text
        flag -- indicates if difference on line
        linenum -- line number (used for line number column)
        text -- line text to be marked up
        """
        try:
            linenum = '%d' % linenum
            # renamed from 'id' so the builtin id() is not shadowed
            id_attr = ' id="%s%s"' % (self._prefix[side],linenum)
        except TypeError:
            # handle blank lines where linenum is '>' or ''
            id_attr = ''
        # replace those things that would get confused with HTML symbols
        text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")

        # make space non-breakable so they don't get compressed or line wrapped
        text = text.replace(' ','&nbsp;').rstrip()

        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
               % (id_attr,linenum,text)

    def _make_prefix(self):
        """Create unique anchor prefixes"""

        # Generate a unique anchor prefix so multiple tables
        # can exist on the same HTML page without conflicts.
        fromprefix = "from%d_" % HtmlDiff._default_prefix
        toprefix = "to%d_" % HtmlDiff._default_prefix
        HtmlDiff._default_prefix += 1
        # store prefixes so line format method has access
        self._prefix = [fromprefix,toprefix]

    def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
        """Makes list of "next" links"""

        # all anchor names will be generated using the unique "to" prefix
        toprefix = self._prefix[1]

        # process change flags, generating middle column of next anchors/links
        next_id = ['']*len(flaglist)
        next_href = ['']*len(flaglist)
        num_chg, in_change = 0, False
        last = 0
        for i,flag in enumerate(flaglist):
            if flag:
                if not in_change:
                    in_change = True
                    last = i
                    # at the beginning of a change, drop an anchor a few lines
                    # (the context lines) before the change for the previous
                    # link
                    i = max([0,i-numlines])
                    next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
                    # at the beginning of a change, drop a link to the next
                    # change
                    num_chg += 1
                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
                         toprefix,num_chg)
            else:
                in_change = False
        # check for cases where there is no content to avoid exceptions
        if not flaglist:
            flaglist = [False]
            next_id = ['']
            next_href = ['']
            last = 0
            if context:
                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
                tolist = fromlist
            else:
                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
        # if not a change on first line, drop a link
        if not flaglist[0]:
            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
        # redo the last link to link to the top
        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)

        return fromlist,tolist,flaglist,next_href,next_id

    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
                   numlines=5):
        """Returns HTML table of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        """

        # make unique anchor prefixes so that multiple tables may exist
        # on the same page without conflict.
        self._make_prefix()

        # change tabs to spaces before it gets more difficult after we insert
        # markup
        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)

        # create diffs iterator which generates side by side from/to data
        if context:
            context_lines = numlines
        else:
            context_lines = None
        diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
                      charjunk=self._charjunk)

        # set up iterator to wrap lines that exceed desired width
        if self._wrapcolumn:
            diffs = self._line_wrapper(diffs)

        # collect up from/to lines and flags into lists (also format the lines)
        fromlist,tolist,flaglist = self._collect_lines(diffs)

        # process change flags, generating middle column of next anchors/links
        fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
            fromlist,tolist,flaglist,context,numlines)

        s = []
        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
              '<td class="diff_next">%s</td>%s</tr>\n'
        for i in range(len(flaglist)):
            if flaglist[i] is None:
                # mdiff yields None on separator lines skip the bogus ones
                # generated for the first line
                if i > 0:
                    s.append('        </tbody>        \n        <tbody>\n')
            else:
                s.append( fmt % (next_id[i],next_href[i],fromlist[i],
                                           next_href[i],tolist[i]))
        if fromdesc or todesc:
            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % todesc)
        else:
            header_row = ''

        table = self._table_template % dict(
            data_rows=''.join(s),
            header_row=header_row,
            prefix=self._prefix[1])

        # turn the intraline change markers produced by the diff machinery
        # into real HTML spans, and tab placeholders back into hard spaces
        return table.replace('\0+','<span class="diff_add">'). \
                     replace('\0-','<span class="diff_sub">'). \
                     replace('\0^','<span class="diff_chg">'). \
                     replace('\1','</span>'). \
                     replace('\t','&nbsp;')
2019
# NOTE(review): 're' is presumably imported earlier in the file (for the
# junk-filter helpers) and is deleted here to keep it out of the module
# namespace -- confirm nothing defined below this point still needs it.
del re
2021
def restore(delta, which):
    r"""
    Generate one of the two sequences that generated a delta.

    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    lines originating from file 1 or 2 (parameter `which`), stripping off line
    prefixes.

    Raises ValueError if `which` is not 1 or 2 (or something convertible
    to one of those integers).

    Examples:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> diff = list(diff)
    >>> print ''.join(restore(diff, 1)),
    one
    two
    three
    >>> print ''.join(restore(diff, 2)),
    ore
    tree
    emu
    """
    try:
        tag = {1: "- ", 2: "+ "}[int(which)]
    except (KeyError, ValueError, TypeError):
        # int(which) itself raises ValueError/TypeError for junk input such
        # as restore(delta, 'x') or restore(delta, None); fold those into
        # the documented ValueError instead of letting a cryptic conversion
        # error escape.
        raise ValueError('unknown delta choice (must be 1 or 2): %r'
                         % which)
    prefixes = ("  ", tag)
    for line in delta:
        # keep unchanged lines and the lines belonging to the chosen side
        if line[:2] in prefixes:
            yield line[2:]
2053
def _test():
    """Run this module's doctests and return doctest's summary results."""
    import doctest
    import difflib
    return doctest.testmod(difflib)
2057
# Running the module as a script executes its doctests.
if __name__ == "__main__":
    _test()
2060