1ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh#! /usr/bin/env python
2ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
3ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh"""
4ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehModule difflib -- helpers for computing deltas between objects.
5ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
6ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehFunction get_close_matches(word, possibilities, n=3, cutoff=0.6):
7ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Use SequenceMatcher to return list of the best "good enough" matches.
8ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
9ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehFunction context_diff(a, b):
10ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    For two lists of strings, return a delta in context diff format.
11ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
12ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehFunction ndiff(a, b):
13ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Return a delta: the difference between `a` and `b` (lists of strings).
14ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
15ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehFunction restore(delta, which):
16ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Return one of the two sequences that generated an ndiff delta.
17ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
18ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehFunction unified_diff(a, b):
19ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    For two lists of strings, return a delta in unified diff format.
20ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
21ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehClass SequenceMatcher:
22ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    A flexible class for comparing pairs of sequences of any type.
23ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
24ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehClass Differ:
25ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    For producing human-readable deltas from sequences of lines of text.
26ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
27ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehClass HtmlDiff:
28ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    For producing HTML side by side comparison with change highlights.
29ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh"""
30ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
31ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
32ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
33ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh           'unified_diff', 'HtmlDiff', 'Match']
34ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
35ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport heapq
36ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehfrom collections import namedtuple as _namedtuple
37ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehfrom functools import reduce
38ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
39ffab958fd8d42ed7227d83007350e61555a1fa36Andrew HsiehMatch = _namedtuple('Match', 'a b size')
40ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
41ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _calculate_ratio(matches, length):
42ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if length:
43ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return 2.0 * matches / length
44ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return 1.0
45ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
46ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehclass SequenceMatcher:
47ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
48ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
49ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    SequenceMatcher is a flexible class for comparing pairs of sequences of
50ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    any type, so long as the sequence elements are hashable.  The basic
51ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    algorithm predates, and is a little fancier than, an algorithm
52ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    published in the late 1980's by Ratcliff and Obershelp under the
53ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    hyperbolic name "gestalt pattern matching".  The basic idea is to find
54ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    the longest contiguous matching subsequence that contains no "junk"
55ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    elements (R-O doesn't address junk).  The same idea is then applied
56ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    recursively to the pieces of the sequences to the left and to the right
57ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    of the matching subsequence.  This does not yield minimal edit
58ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    sequences, but does tend to yield matches that "look right" to people.
59ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
60ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    SequenceMatcher tries to compute a "human-friendly diff" between two
61ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
62ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    longest *contiguous* & junk-free matching subsequence.  That's what
63ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    catches peoples' eyes.  The Windows(tm) windiff has another interesting
64ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    notion, pairing up elements that appear uniquely in each sequence.
65ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    That, and the method here, appear to yield more intuitive difference
66ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    reports than does diff.  This method appears to be the least vulnerable
67ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    to synching up on blocks of "junk lines", though (like blank lines in
68ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
69ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    because this is the only method of the 3 that has a *concept* of
70ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    "junk" <wink>.
71ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
72ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Example, comparing two strings, and considering blanks to be "junk":
73ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
74ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> s = SequenceMatcher(lambda x: x == " ",
75ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...                     "private Thread currentThread;",
76ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...                     "private volatile Thread currentThread;")
77ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>>
78ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
79ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    .ratio() returns a float in [0, 1], measuring the "similarity" of the
80ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
81ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    sequences are close matches:
82ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
83ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> print round(s.ratio(), 3)
84ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    0.866
85ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>>
86ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
87ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    If you're only interested in where the sequences match,
88ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    .get_matching_blocks() is handy:
89ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
90ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> for block in s.get_matching_blocks():
91ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...     print "a[%d] and b[%d] match for %d elements" % block
92ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    a[0] and b[0] match for 8 elements
93ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    a[8] and b[17] match for 21 elements
94ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    a[29] and b[38] match for 0 elements
95ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
96ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Note that the last tuple returned by .get_matching_blocks() is always a
97ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    dummy, (len(a), len(b), 0), and this is the only case in which the last
98ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    tuple element (number of elements matched) is 0.
99ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
100ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    If you want to know how to change the first sequence into the second,
101ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    use .get_opcodes():
102ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
103ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> for opcode in s.get_opcodes():
104ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
105ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     equal a[0:8] b[0:8]
106ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    insert a[8:8] b[8:17]
107ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     equal a[8:29] b[17:38]
108ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
109ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    See the Differ class for a fancy human-friendly file differencer, which
110ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    uses SequenceMatcher both to compare sequences of lines, and to compare
111ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    sequences of characters within similar (near-matching) lines.
112ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
113ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    See also function get_close_matches() in this module, which shows how
114ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    simple code building on SequenceMatcher can be used to do useful work.
115ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
116ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Timing:  Basic R-O is cubic time worst case and quadratic time expected
117ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    case.  SequenceMatcher is quadratic time for the worst case and has
118ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    expected-case behavior dependent in a complicated way on how many
119ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    elements the sequences have in common; best case time is linear.
120ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
121ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Methods:
122ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
123ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    __init__(isjunk=None, a='', b='')
124ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Construct a SequenceMatcher.
125ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
126ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    set_seqs(a, b)
127ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Set the two sequences to be compared.
128ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
129ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    set_seq1(a)
130ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Set the first sequence to be compared.
131ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
132ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    set_seq2(b)
133ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Set the second sequence to be compared.
134ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
135ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    find_longest_match(alo, ahi, blo, bhi)
136ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Find longest matching block in a[alo:ahi] and b[blo:bhi].
137ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
138ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    get_matching_blocks()
139ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Return list of triples describing matching subsequences.
140ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
141ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    get_opcodes()
142ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Return list of 5-tuples describing how to turn a into b.
143ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
144ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ratio()
145ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Return a measure of the sequences' similarity (float in [0,1]).
146ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
147ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    quick_ratio()
148ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Return an upper bound on .ratio() relatively quickly.
149ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
150ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    real_quick_ratio()
151ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Return an upper bound on ratio() very quickly.
152ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
153ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
154ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def __init__(self, isjunk=None, a='', b='', autojunk=True):
155ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Construct a SequenceMatcher.
156ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
157ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Optional arg isjunk is None (the default), or a one-argument
158ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        function that takes a sequence element and returns true iff the
159ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
160ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        no elements are considered to be junk.  For example, pass
161ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            lambda x: x in " \\t"
162ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if you're comparing lines as sequences of characters, and don't
163ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        want to synch up on blanks or hard tabs.
164ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
165ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Optional arg a is the first of two sequences to be compared.  By
166ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        default, an empty string.  The elements of a must be hashable.  See
167ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        also .set_seqs() and .set_seq1().
168ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
169ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Optional arg b is the second of two sequences to be compared.  By
170ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        default, an empty string.  The elements of b must be hashable. See
171ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        also .set_seqs() and .set_seq2().
172ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
173ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Optional arg autojunk should be set to False to disable the
174ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        "automatic junk heuristic" that treats popular elements as junk
175ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        (see module documentation for more information).
176ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
177ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
178ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Members:
179ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # a
180ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      first sequence
181ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # b
182ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      second sequence; differences are computed as "what do
183ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      we need to do to 'a' to change it into 'b'?"
184ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # b2j
185ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      for x in b, b2j[x] is a list of the indices (into b)
186ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      at which x appears; junk elements do not appear
187ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # fullbcount
188ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      for x in b, fullbcount[x] == the number of times x
189ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      appears in b; only materialized if really needed (used
190ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      only for computing quick_ratio())
191ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # matching_blocks
192ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
193ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      ascending & non-overlapping in i and in j; terminated by
194ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      a dummy (len(a), len(b), 0) sentinel
195ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # opcodes
196ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
197ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      one of
198ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
199ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #          'delete'    a[i1:i2] should be deleted
200ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #          'insert'    b[j1:j2] should be inserted
201ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #          'equal'     a[i1:i2] == b[j1:j2]
202ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # isjunk
203ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      a user-supplied function taking a sequence element and
204ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      returning true iff the element is "junk" -- this has
205ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      subtle but helpful effects on the algorithm, which I'll
206ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      get around to writing up someday <0.9 wink>.
207ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
208ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # isbjunk
209ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
210ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      it's really the __contains__ method of a hidden dict.
211ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      DOES NOT WORK for x in a!
212ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # isbpopular
213ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      for x in b, isbpopular(x) is true iff b is reasonably long
214ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      (at least 200 elements) and x accounts for more than 1 + 1% of
215ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      its elements (when autojunk is enabled).
216ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #      DOES NOT WORK for x in a!
217ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
218ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.isjunk = isjunk
219ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.a = self.b = None
220ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.autojunk = autojunk
221ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.set_seqs(a, b)
222ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
223ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def set_seqs(self, a, b):
224ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Set the two sequences to be compared.
225ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
226ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s = SequenceMatcher()
227ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.set_seqs("abcd", "bcde")
228ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.ratio()
229ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        0.75
230ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
231ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
232ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.set_seq1(a)
233ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.set_seq2(b)
234ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
235ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def set_seq1(self, a):
236ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Set the first sequence to be compared.
237ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
238ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        The second sequence to be compared is not changed.
239ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
240ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s = SequenceMatcher(None, "abcd", "bcde")
241ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.ratio()
242ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        0.75
243ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.set_seq1("bcde")
244ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.ratio()
245ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        1.0
246ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>>
247ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
248ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        SequenceMatcher computes and caches detailed information about the
249ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        second sequence, so if you want to compare one sequence S against
250ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        many sequences, use .set_seq2(S) once and call .set_seq1(x)
251ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        repeatedly for each of the other sequences.
252ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
253ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        See also set_seqs() and set_seq2().
254ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
255ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
256ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if a is self.a:
257ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return
258ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.a = a
259ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.matching_blocks = self.opcodes = None
260ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
261ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def set_seq2(self, b):
262ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Set the second sequence to be compared.
263ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
264ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        The first sequence to be compared is not changed.
265ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
266ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s = SequenceMatcher(None, "abcd", "bcde")
267ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.ratio()
268ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        0.75
269ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.set_seq2("abcd")
270ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.ratio()
271ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        1.0
272ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>>
273ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
274ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        SequenceMatcher computes and caches detailed information about the
275ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        second sequence, so if you want to compare one sequence S against
276ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        many sequences, use .set_seq2(S) once and call .set_seq1(x)
277ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        repeatedly for each of the other sequences.
278ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
279ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        See also set_seqs() and set_seq1().
280ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
281ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
282ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if b is self.b:
283ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return
284ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.b = b
285ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.matching_blocks = self.opcodes = None
286ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.fullbcount = None
287ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.__chain_b()
288ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
289ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # For each element x in b, set b2j[x] to a list of the indices in
290ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # b where x appears; the indices are in increasing order; note that
291ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # the number of times x appears in b is len(b2j[x]) ...
292ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # when self.isjunk is defined, junk elements don't show up in this
293ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # map at all, which stops the central find_longest_match method
294ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # from starting any matching block at a junk element ...
295ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # also creates the fast isbjunk function ...
296ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # b2j also does not contain entries for "popular" elements, meaning
297ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # elements that account for more than 1 + 1% of the total elements, and
298ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # when the sequence is reasonably large (>= 200 elements); this can
299ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # be viewed as an adaptive notion of semi-junk, and yields an enormous
300ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # speedup when, e.g., comparing program files with hundreds of
301ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # instances of "return NULL;" ...
302ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # note that this is only called when b changes; so for cross-product
303ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # kinds of matches, it's best to call set_seq2 once, then set_seq1
304ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # repeatedly
305ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
306ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def __chain_b(self):
307ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Because isjunk is a user-defined (not C) function, and we test
308ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # for junk a LOT, it's important to minimize the number of calls.
309ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Before the tricks described here, __chain_b was by far the most
310ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # time-consuming routine in the whole module!  If anyone sees
311ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Jim Roskind, thank him again for profile.py -- I never would
312ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # have guessed that.
313ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # The first trick is to build b2j ignoring the possibility
314ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
315ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # out the junk later is much cheaper than building b2j "right"
316ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # from the start.
317ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        b = self.b
318ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.b2j = b2j = {}
319ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
320ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for i, elt in enumerate(b):
321ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            indices = b2j.setdefault(elt, [])
322ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            indices.append(i)
323ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
324ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Purge junk elements
325ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        junk = set()
326ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        isjunk = self.isjunk
327ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if isjunk:
328ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for elt in list(b2j.keys()):  # using list() since b2j is modified
329ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if isjunk(elt):
330ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    junk.add(elt)
331ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    del b2j[elt]
332ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
333ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Purge popular elements that are not junk
334ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        popular = set()
335ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        n = len(b)
336ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if self.autojunk and n >= 200:
337ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            ntest = n // 100 + 1
338ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for elt, idxs in list(b2j.items()):
339ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if len(idxs) > ntest:
340ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    popular.add(elt)
341ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    del b2j[elt]
342ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
343ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
344ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Sicne the number of *unique* junk elements is probably small, the
345ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # memory burden of keeping this set alive is likely trivial compared to
346ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # the size of b2j.
347ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.isbjunk = junk.__contains__
348ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.isbpopular = popular.__contains__
349ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
350ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def find_longest_match(self, alo, ahi, blo, bhi):
351ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Find longest matching block in a[alo:ahi] and b[blo:bhi].
352ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
353ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        If isjunk is not defined:
354ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
355ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
356ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            alo <= i <= i+k <= ahi
357ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            blo <= j <= j+k <= bhi
358ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        and for all (i',j',k') meeting those conditions,
359ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            k >= k'
360ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            i <= i'
361ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            and if i == i', j <= j'
362ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
363ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        In other words, of all maximal matching blocks, return one that
364ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        starts earliest in a, and of all those maximal matching blocks that
365ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        start earliest in a, return the one that starts earliest in b.
366ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
367ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
368ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.find_longest_match(0, 5, 0, 9)
369ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Match(a=0, b=4, size=5)
370ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
371ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        If isjunk is defined, first the longest matching block is
372ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        determined as above, but with the additional restriction that no
373ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        junk element appears in the block.  Then that block is extended as
374ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        far as possible by matching (only) junk elements on both sides.  So
375ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        the resulting block never matches on junk except as identical junk
376ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        happens to be adjacent to an "interesting" match.
377ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
378ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Here's the same example as before, but considering blanks to be
379ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        junk.  That prevents " abcd" from matching the " abcd" at the tail
380ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        end of the second sequence directly.  Instead only the "abcd" can
381ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        match, and matches the leftmost "abcd" in the second sequence:
382ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
383ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
384ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.find_longest_match(0, 5, 0, 9)
385ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Match(a=1, b=0, size=4)
386ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
387ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        If no blocks match, return (alo, blo, 0).
388ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
389ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s = SequenceMatcher(None, "ab", "c")
390ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.find_longest_match(0, 2, 0, 1)
391ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Match(a=0, b=0, size=0)
392ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
393ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
394ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # CAUTION:  stripping common prefix or suffix would be incorrect.
395ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # E.g.,
396ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #    ab
397ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        #    acab
398ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Longest matching block is "ab", but if common prefix is
399ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
400ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # strip, so ends up claiming that ab is changed to acab by
401ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # inserting "ca" in the middle.  That's minimal but unintuitive:
402ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # "it's obvious" that someone inserted "ac" at the front.
403ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Windiff ends up at the same place as diff, but by pairing up
404ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # the unique 'b's and then matching the first two 'a's.
405ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
406ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
407ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        besti, bestj, bestsize = alo, blo, 0
408ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # find longest junk-free match
409ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # during an iteration of the loop, j2len[j] = length of longest
410ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # junk-free match ending with a[i-1] and b[j]
411ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        j2len = {}
412ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        nothing = []
413ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for i in xrange(alo, ahi):
414ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # look at all instances of a[i] in b; note that because
415ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # b2j has no junk keys, the loop is skipped if a[i] is junk
416ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            j2lenget = j2len.get
417ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            newj2len = {}
418ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for j in b2j.get(a[i], nothing):
419ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # a[i] matches b[j]
420ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if j < blo:
421ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    continue
422ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if j >= bhi:
423ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    break
424ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                k = newj2len[j] = j2lenget(j-1, 0) + 1
425ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if k > bestsize:
426ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    besti, bestj, bestsize = i-k+1, j-k+1, k
427ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            j2len = newj2len
428ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
429ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Extend the best by non-junk elements on each end.  In particular,
430ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # "popular" non-junk elements aren't in b2j, which greatly speeds
431ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # the inner loop above, but also means "the best" match so far
432ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # doesn't contain any junk *or* popular non-junk elements.
433ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while besti > alo and bestj > blo and \
434ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh              not isbjunk(b[bestj-1]) and \
435ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh              a[besti-1] == b[bestj-1]:
436ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
437ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while besti+bestsize < ahi and bestj+bestsize < bhi and \
438ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh              not isbjunk(b[bestj+bestsize]) and \
439ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh              a[besti+bestsize] == b[bestj+bestsize]:
440ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            bestsize += 1
441ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
442ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Now that we have a wholly interesting match (albeit possibly
443ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # empty!), we may as well suck up the matching junk on each
444ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # side of it too.  Can't think of a good reason not to, and it
445ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # saves post-processing the (possibly considerable) expense of
446ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # figuring out what to do with it.  In the case of an empty
447ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # interesting match, this is clearly the right thing to do,
448ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # because no other kind of match is possible in the regions.
449ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while besti > alo and bestj > blo and \
450ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh              isbjunk(b[bestj-1]) and \
451ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh              a[besti-1] == b[bestj-1]:
452ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
453ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while besti+bestsize < ahi and bestj+bestsize < bhi and \
454ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh              isbjunk(b[bestj+bestsize]) and \
455ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh              a[besti+bestsize] == b[bestj+bestsize]:
456ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            bestsize = bestsize + 1
457ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
458ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return Match(besti, bestj, bestsize)
459ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
460ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def get_matching_blocks(self):
461ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Return list of triples describing matching subsequences.
462ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
463ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Each triple is of the form (i, j, n), and means that
464ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
465ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        i and in j.  New in Python 2.5, it's also guaranteed that if
466ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        (i, j, n) and (i', j', n') are adjacent triples in the list, and
467ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        the second is not the last triple in the list, then i+n != i' or
468ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        j+n != j'.  IOW, adjacent triples never describe adjacent equal
469ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        blocks.
470ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
471ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        The last triple is a dummy, (len(a), len(b), 0), and is the only
472ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        triple with n==0.
473ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
474ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s = SequenceMatcher(None, "abxcd", "abcd")
475ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.get_matching_blocks()
476ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
477ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
478ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
479ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if self.matching_blocks is not None:
480ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return self.matching_blocks
481ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        la, lb = len(self.a), len(self.b)
482ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
483ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # This is most naturally expressed as a recursive algorithm, but
484ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # at least one user bumped into extreme use cases that exceeded
485ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # the recursion limit on their box.  So, now we maintain a list
486ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # ('queue`) of blocks we still need to look at, and append partial
487ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # results to `matching_blocks` in a loop; the matches are sorted
488ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # at the end.
489ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        queue = [(0, la, 0, lb)]
490ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        matching_blocks = []
491ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while queue:
492ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            alo, ahi, blo, bhi = queue.pop()
493ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
494ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # a[alo:i] vs b[blo:j] unknown
495ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # a[i:i+k] same as b[j:j+k]
496ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # a[i+k:ahi] vs b[j+k:bhi] unknown
497ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if k:   # if k is 0, there was no matching block
498ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                matching_blocks.append(x)
499ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if alo < i and blo < j:
500ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    queue.append((alo, i, blo, j))
501ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if i+k < ahi and j+k < bhi:
502ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    queue.append((i+k, ahi, j+k, bhi))
503ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        matching_blocks.sort()
504ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
505ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # It's possible that we have adjacent equal blocks in the
506ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # matching_blocks list now.  Starting with 2.5, this code was added
507ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # to collapse them.
508ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        i1 = j1 = k1 = 0
509ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        non_adjacent = []
510ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for i2, j2, k2 in matching_blocks:
511ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # Is this block adjacent to i1, j1, k1?
512ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if i1 + k1 == i2 and j1 + k1 == j2:
513ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # Yes, so collapse them -- this just increases the length of
514ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # the first block by the length of the second, and the first
515ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # block so lengthened remains the block to compare against.
516ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                k1 += k2
517ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
518ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # Not adjacent.  Remember the first block (k1==0 means it's
519ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # the dummy we started with), and make the second block the
520ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # new block to compare against.
521ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if k1:
522ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    non_adjacent.append((i1, j1, k1))
523ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                i1, j1, k1 = i2, j2, k2
524ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if k1:
525ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            non_adjacent.append((i1, j1, k1))
526ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
527ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        non_adjacent.append( (la, lb, 0) )
528ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.matching_blocks = non_adjacent
529ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return map(Match._make, self.matching_blocks)
530ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
531ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def get_opcodes(self):
532ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Return list of 5-tuples describing how to turn a into b.
533ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
534ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
535ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
536ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        tuple preceding it, and likewise for j1 == the previous j2.
537ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
538ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        The tags are strings, with these meanings:
539ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
540ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        'replace':  a[i1:i2] should be replaced by b[j1:j2]
541ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        'delete':   a[i1:i2] should be deleted.
542ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    Note that j1==j2 in this case.
543ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        'insert':   b[j1:j2] should be inserted at a[i1:i1].
544ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    Note that i1==i2 in this case.
545ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        'equal':    a[i1:i2] == b[j1:j2]
546ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
547ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> a = "qabxcd"
548ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> b = "abycdf"
549ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s = SequenceMatcher(None, a, b)
550ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
551ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
552ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
553ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh         delete a[0:1] (q) b[0:0] ()
554ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          equal a[1:3] (ab) b[0:2] (ab)
555ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        replace a[3:4] (x) b[2:3] (y)
556ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          equal a[4:6] (cd) b[3:5] (cd)
557ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh         insert a[6:6] () b[5:6] (f)
558ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
559ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
560ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if self.opcodes is not None:
561ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return self.opcodes
562ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        i = j = 0
563ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.opcodes = answer = []
564ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for ai, bj, size in self.get_matching_blocks():
565ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # invariant:  we've pumped out correct diffs to change
566ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # a[:i] into b[:j], and the next matching block is
567ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
568ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # out a diff to change a[i:ai] into b[j:bj], pump out
569ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # the matching block, and move (i,j) beyond the match
570ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            tag = ''
571ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if i < ai and j < bj:
572ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                tag = 'replace'
573ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif i < ai:
574ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                tag = 'delete'
575ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif j < bj:
576ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                tag = 'insert'
577ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if tag:
578ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                answer.append( (tag, i, ai, j, bj) )
579ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            i, j = ai+size, bj+size
580ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # the list of matching blocks is terminated by a
581ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # sentinel with size 0
582ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if size:
583ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                answer.append( ('equal', ai, i, bj, j) )
584ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return answer
585ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
586ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def get_grouped_opcodes(self, n=3):
587ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """ Isolate change clusters by eliminating ranges with no changes.
588ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
589ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Return a generator of groups with upto n lines of context.
590ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Each group is in the same format as returned by get_opcodes().
591ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
592ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> from pprint import pprint
593ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> a = map(str, range(1,40))
594ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> b = a[:]
595ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> b[8:8] = ['i']     # Make an insertion
596ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> b[20] += 'x'       # Make a replacement
597ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> b[23:28] = []      # Make a deletion
598ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> b[30] += 'y'       # Make another replacement
599ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
600ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
601ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh         [('equal', 16, 19, 17, 20),
602ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          ('replace', 19, 20, 20, 21),
603ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          ('equal', 20, 22, 21, 23),
604ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          ('delete', 22, 27, 23, 23),
605ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          ('equal', 27, 30, 23, 26)],
606ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh         [('equal', 31, 34, 27, 30),
607ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          ('replace', 34, 35, 30, 31),
608ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          ('equal', 35, 38, 31, 34)]]
609ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
610ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
611ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        codes = self.get_opcodes()
612ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if not codes:
613ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            codes = [("equal", 0, 1, 0, 1)]
614ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Fixup leading and trailing groups if they show no changes.
615ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if codes[0][0] == 'equal':
616ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            tag, i1, i2, j1, j2 = codes[0]
617ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
618ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if codes[-1][0] == 'equal':
619ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            tag, i1, i2, j1, j2 = codes[-1]
620ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
621ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
622ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        nn = n + n
623ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        group = []
624ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for tag, i1, i2, j1, j2 in codes:
625ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # End the current group and start a new one whenever
626ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # there is a large range with no changes.
627ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if tag == 'equal' and i2-i1 > nn:
628ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
629ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield group
630ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                group = []
631ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                i1, j1 = max(i1, i2-n), max(j1, j2-n)
632ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            group.append((tag, i1, i2, j1 ,j2))
633ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if group and not (len(group)==1 and group[0][0] == 'equal'):
634ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield group
635ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
636ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def ratio(self):
637ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Return a measure of the sequences' similarity (float in [0,1]).
638ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
639ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Where T is the total number of elements in both sequences, and
640ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        M is the number of matches, this is 2.0*M / T.
641ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Note that this is 1 if the sequences are identical, and 0 if
642ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        they have nothing in common.
643ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
644ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        .ratio() is expensive to compute if you haven't already computed
645ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        .get_matching_blocks() or .get_opcodes(), in which case you may
646ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        want to try .quick_ratio() or .real_quick_ratio() first to get an
647ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        upper bound.
648ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
649ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s = SequenceMatcher(None, "abcd", "bcde")
650ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.ratio()
651ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        0.75
652ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.quick_ratio()
653ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        0.75
654ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> s.real_quick_ratio()
655ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        1.0
656ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
657ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
658ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        matches = reduce(lambda sum, triple: sum + triple[-1],
659ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                         self.get_matching_blocks(), 0)
660ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return _calculate_ratio(matches, len(self.a) + len(self.b))
661ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
662ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def quick_ratio(self):
663ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Return an upper bound on ratio() relatively quickly.
664ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
665ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        This isn't defined beyond that it is an upper bound on .ratio(), and
666ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        is faster to compute.
667ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
668ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
669ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # viewing a and b as multisets, set matches to the cardinality
670ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # of their intersection; this counts the number of matches
671ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # without regard to order, so is clearly an upper bound
672ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if self.fullbcount is None:
673ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            self.fullbcount = fullbcount = {}
674ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for elt in self.b:
675ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                fullbcount[elt] = fullbcount.get(elt, 0) + 1
676ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fullbcount = self.fullbcount
677ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # avail[x] is the number of times x appears in 'b' less the
678ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # number of times we've seen it in 'a' so far ... kinda
679ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        avail = {}
680ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        availhas, matches = avail.__contains__, 0
681ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for elt in self.a:
682ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if availhas(elt):
683ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                numb = avail[elt]
684ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
685ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                numb = fullbcount.get(elt, 0)
686ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            avail[elt] = numb - 1
687ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if numb > 0:
688ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                matches = matches + 1
689ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return _calculate_ratio(matches, len(self.a) + len(self.b))
690ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
691ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def real_quick_ratio(self):
692ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Return an upper bound on ratio() very quickly.
693ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
694ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        This isn't defined beyond that it is an upper bound on .ratio(), and
695ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        is faster to compute than either .ratio() or .quick_ratio().
696ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
697ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
698ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        la, lb = len(self.a), len(self.b)
699ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # can't have more matches than the number of elements in the
700ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # shorter sequence
701ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return _calculate_ratio(min(la, lb), la + lb)
702ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
703ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef get_close_matches(word, possibilities, n=3, cutoff=0.6):
704ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """Use SequenceMatcher to return list of the best "good enough" matches.
705ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
706ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    word is a sequence for which close matches are desired (typically a
707ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    string).
708ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
709ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    possibilities is a list of sequences against which to match word
710ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    (typically a list of strings).
711ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
712ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Optional arg n (default 3) is the maximum number of close matches to
713ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return.  n must be > 0.
714ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
715ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
716ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    that don't score at least that similar to word are ignored.
717ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
718ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    The best (no more than n) matches among the possibilities are returned
719ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    in a list, sorted by similarity score, most similar first.
720ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
721ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
722ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ['apple', 'ape']
723ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> import keyword as _keyword
724ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> get_close_matches("wheel", _keyword.kwlist)
725ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ['while']
726ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> get_close_matches("apple", _keyword.kwlist)
727ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    []
728ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> get_close_matches("accept", _keyword.kwlist)
729ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ['except']
730ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
731ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
732ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if not n >  0:
733ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        raise ValueError("n must be > 0: %r" % (n,))
734ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if not 0.0 <= cutoff <= 1.0:
735ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
736ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    result = []
737ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    s = SequenceMatcher()
738ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    s.set_seq2(word)
739ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for x in possibilities:
740ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        s.set_seq1(x)
741ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if s.real_quick_ratio() >= cutoff and \
742ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh           s.quick_ratio() >= cutoff and \
743ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh           s.ratio() >= cutoff:
744ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            result.append((s.ratio(), x))
745ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
746ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # Move the best scorers to head of list
747ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    result = heapq.nlargest(n, result)
748ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # Strip scores for the best n matches
749ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return [x for score, x in result]
750ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
751ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _count_leading(line, ch):
752ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
753ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Return number of `ch` characters at the start of `line`.
754ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
755ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Example:
756ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
757ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> _count_leading('   abc', ' ')
758ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    3
759ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
760ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
761ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    i, n = 0, len(line)
762ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    while i < n and line[i] == ch:
763ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        i += 1
764ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return i
765ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
766ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehclass Differ:
767ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    r"""
768ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Differ is a class for comparing sequences of lines of text, and
769ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    producing human-readable differences or deltas.  Differ uses
770ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    SequenceMatcher both to compare sequences of lines, and to compare
771ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    sequences of characters within similar (near-matching) lines.
772ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
773ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Each line of a Differ delta begins with a two-letter code:
774ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
775ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        '- '    line unique to sequence 1
776ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        '+ '    line unique to sequence 2
777ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        '  '    line common to both sequences
778ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        '? '    line not present in either input sequence
779ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
780ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Lines beginning with '? ' attempt to guide the eye to intraline
781ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    differences, and were not present in either input sequence.  These lines
782ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    can be confusing if the sequences contain tab characters.
783ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
784ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Note that Differ makes no claim to produce a *minimal* diff.  To the
785ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    contrary, minimal diffs are often counter-intuitive, because they synch
786ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    up anywhere possible, sometimes accidental matches 100 pages apart.
787ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Restricting synch points to contiguous matches preserves some notion of
788ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    locality, at the occasional cost of producing a longer diff.
789ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
790ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Example: Comparing two texts.
791ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
792ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    First we set up the texts, sequences of individual single-line strings
793ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ending with newlines (such sequences can also be obtained from the
794ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    `readlines()` method of file-like objects):
795ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
796ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> text1 = '''  1. Beautiful is better than ugly.
797ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...   2. Explicit is better than implicit.
798ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...   3. Simple is better than complex.
799ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...   4. Complex is better than complicated.
800ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ... '''.splitlines(1)
801ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> len(text1)
802ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    4
803ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> text1[0][-1]
804ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    '\n'
805ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> text2 = '''  1. Beautiful is better than ugly.
806ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...   3.   Simple is better than complex.
807ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...   4. Complicated is better than complex.
808ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...   5. Flat is better than nested.
809ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ... '''.splitlines(1)
810ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
811ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Next we instantiate a Differ object:
812ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
813ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> d = Differ()
814ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
815ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Note that when instantiating a Differ object we may pass functions to
816ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    filter out line and character 'junk'.  See Differ.__init__ for details.
817ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
818ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Finally, we compare the two:
819ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
820ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> result = list(d.compare(text1, text2))
821ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
822ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    'result' is a list of strings, so let's pretty-print it:
823ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
824ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> from pprint import pprint as _pprint
825ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> _pprint(result)
826ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ['    1. Beautiful is better than ugly.\n',
827ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     '-   2. Explicit is better than implicit.\n',
828ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     '-   3. Simple is better than complex.\n',
829ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     '+   3.   Simple is better than complex.\n',
830ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     '?     ++\n',
831ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     '-   4. Complex is better than complicated.\n',
832ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     '?            ^                     ---- ^\n',
833ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     '+   4. Complicated is better than complex.\n',
834ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     '?           ++++ ^                      ^\n',
835ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     '+   5. Flat is better than nested.\n']
836ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
837ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    As a single multi-line string it looks like this:
838ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
839ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> print ''.join(result),
840ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        1. Beautiful is better than ugly.
841ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    -   2. Explicit is better than implicit.
842ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    -   3. Simple is better than complex.
843ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    +   3.   Simple is better than complex.
844ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ?     ++
845ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    -   4. Complex is better than complicated.
846ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ?            ^                     ---- ^
847ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    +   4. Complicated is better than complex.
848ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ?           ++++ ^                      ^
849ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    +   5. Flat is better than nested.
850ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
851ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Methods:
852ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
853ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    __init__(linejunk=None, charjunk=None)
854ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Construct a text differencer, with optional filters.
855ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
856ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    compare(a, b)
857ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Compare two sequences of lines; generate the resulting delta.
858ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
859ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
860ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def __init__(self, linejunk=None, charjunk=None):
861ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
862ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Construct a text differencer, with optional filters.
863ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
864ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        The two optional keyword parameters are for filter functions:
865ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
866ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        - `linejunk`: A function that should accept a single string argument,
867ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          and return true iff the string is junk. The module-level function
868ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          `IS_LINE_JUNK` may be used to filter out lines without visible
869ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          characters, except for at most one splat ('#').  It is recommended
870ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          to leave linejunk None; as of Python 2.3, the underlying
871ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          SequenceMatcher class has grown an adaptive notion of "noise" lines
872ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          that's better than any static definition the author has ever been
873ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          able to craft.
874ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
875ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        - `charjunk`: A function that should accept a string of length 1. The
876ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          module-level function `IS_CHARACTER_JUNK` may be used to filter out
877ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          whitespace characters (a blank or tab; **note**: bad idea to include
878ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
879ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
880ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
881ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.linejunk = linejunk
882ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.charjunk = charjunk
883ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
884ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def compare(self, a, b):
885ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        r"""
886ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Compare two sequences of lines; generate the resulting delta.
887ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
888ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Each sequence must contain individual single-line strings ending with
889ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        newlines. Such sequences can be obtained from the `readlines()` method
890ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        of file-like objects.  The delta generated also consists of newline-
891ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        terminated strings, ready to be printed as-is via the writeline()
892ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        method of a file-like object.
893ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
894ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Example:
895ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
896ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
897ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ...                                'ore\ntree\nemu\n'.splitlines(1))),
898ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        - one
899ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ?  ^
900ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        + ore
901ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ?  ^
902ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        - two
903ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        - three
904ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ?  -
905ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        + tree
906ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        + emu
907ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
908ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
909ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        cruncher = SequenceMatcher(self.linejunk, a, b)
910ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
911ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if tag == 'replace':
912ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
913ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif tag == 'delete':
914ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                g = self._dump('-', a, alo, ahi)
915ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif tag == 'insert':
916ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                g = self._dump('+', b, blo, bhi)
917ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif tag == 'equal':
918ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                g = self._dump(' ', a, alo, ahi)
919ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
920ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                raise ValueError, 'unknown tag %r' % (tag,)
921ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
922ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for line in g:
923ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield line
924ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
925ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _dump(self, tag, x, lo, hi):
926ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Generate comparison results for a same-tagged range."""
927ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for i in xrange(lo, hi):
928ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield '%s %s' % (tag, x[i])
929ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
930ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
931ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        assert alo < ahi and blo < bhi
932ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # dump the shorter block first -- reduces the burden on short-term
933ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # memory if the blocks are of very different sizes
934ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if bhi - blo < ahi - alo:
935ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            first  = self._dump('+', b, blo, bhi)
936ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            second = self._dump('-', a, alo, ahi)
937ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        else:
938ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            first  = self._dump('-', a, alo, ahi)
939ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            second = self._dump('+', b, blo, bhi)
940ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
941ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for g in first, second:
942ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for line in g:
943ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield line
944ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
945ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
946ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        r"""
947ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        When replacing one block of lines with another, search the blocks
948ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for *similar* lines; the best-matching pair (if any) is used as a
949ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        synch point, and intraline difference marking is done on the
950ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        similar pair. Lots of work, but often worth it.
951ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
952ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Example:
953ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
954ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> d = Differ()
955ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
956ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ...                            ['abcdefGhijkl\n'], 0, 1)
957ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> print ''.join(results),
958ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        - abcDefghiJkl
959ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ?    ^  ^  ^
960ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        + abcdefGhijkl
961ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ?    ^  ^  ^
962ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
963ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
964ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # don't synch up unless the lines have a similarity score of at
965ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # least cutoff; best_ratio tracks the best score seen so far
966ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        best_ratio, cutoff = 0.74, 0.75
967ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        cruncher = SequenceMatcher(self.charjunk)
968ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        eqi, eqj = None, None   # 1st indices of equal lines (if any)
969ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
970ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # search for the pair that matches best without being identical
971ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # (identical lines must be junk lines, & we don't want to synch up
972ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # on junk -- unless we have to)
973ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for j in xrange(blo, bhi):
974ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            bj = b[j]
975ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            cruncher.set_seq2(bj)
976ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for i in xrange(alo, ahi):
977ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                ai = a[i]
978ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if ai == bj:
979ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    if eqi is None:
980ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        eqi, eqj = i, j
981ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    continue
982ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                cruncher.set_seq1(ai)
983ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # computing similarity is expensive, so use the quick
984ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # upper bounds first -- have seen this speed up messy
985ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # compares by a factor of 3.
986ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # note that ratio() is only expensive to compute the first
987ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # time it's called on a sequence pair; the expensive part
988ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # of the computation is cached by cruncher
989ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if cruncher.real_quick_ratio() > best_ratio and \
990ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      cruncher.quick_ratio() > best_ratio and \
991ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      cruncher.ratio() > best_ratio:
992ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
993ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if best_ratio < cutoff:
994ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # no non-identical "pretty close" pair
995ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if eqi is None:
996ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # no identical pair either -- treat it as a straight replace
997ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
998ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    yield line
999ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                return
1000ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # no close pair, but an identical pair -- synch up on that
1001ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            best_i, best_j, best_ratio = eqi, eqj, 1.0
1002ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        else:
1003ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # there's a close pair, so forget the identical pair (if any)
1004ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            eqi = None
1005ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1006ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
1007ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # identical
1008ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1009ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # pump out diffs from before the synch point
1010ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
1011ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield line
1012ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1013ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # do intraline marking on the synch pair
1014ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        aelt, belt = a[best_i], b[best_j]
1015ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if eqi is None:
1016ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # pump out a '-', '?', '+', '?' quad for the synched lines
1017ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            atags = btags = ""
1018ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            cruncher.set_seqs(aelt, belt)
1019ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
1020ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                la, lb = ai2 - ai1, bj2 - bj1
1021ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if tag == 'replace':
1022ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    atags += '^' * la
1023ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    btags += '^' * lb
1024ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                elif tag == 'delete':
1025ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    atags += '-' * la
1026ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                elif tag == 'insert':
1027ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    btags += '+' * lb
1028ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                elif tag == 'equal':
1029ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    atags += ' ' * la
1030ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    btags += ' ' * lb
1031ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                else:
1032ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    raise ValueError, 'unknown tag %r' % (tag,)
1033ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for line in self._qformat(aelt, belt, atags, btags):
1034ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield line
1035ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        else:
1036ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # the synch pair is identical
1037ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield '  ' + aelt
1038ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1039ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # pump out diffs from after the synch point
1040ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
1041ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield line
1042ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1043ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
1044ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        g = []
1045ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if alo < ahi:
1046ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if blo < bhi:
1047ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
1048ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
1049ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                g = self._dump('-', a, alo, ahi)
1050ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif blo < bhi:
1051ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            g = self._dump('+', b, blo, bhi)
1052ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1053ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for line in g:
1054ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield line
1055ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1056ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _qformat(self, aline, bline, atags, btags):
1057ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        r"""
1058ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Format "?" output and deal with leading tabs.
1059ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1060ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Example:
1061ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1062ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> d = Differ()
1063ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
1064ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
1065ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        >>> for line in results: print repr(line)
1066ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        ...
1067ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        '- \tabcDefghiJkl\n'
1068ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        '? \t ^ ^  ^\n'
1069ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        '+ \tabcdefGhijkl\n'
1070ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        '? \t ^ ^  ^\n'
1071ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1072ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1073ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Can hurt, but will probably help most of the time.
1074ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        common = min(_count_leading(aline, "\t"),
1075ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                     _count_leading(bline, "\t"))
1076ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        common = min(common, _count_leading(atags[:common], " "))
1077ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        common = min(common, _count_leading(btags[:common], " "))
1078ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        atags = atags[common:].rstrip()
1079ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        btags = btags[common:].rstrip()
1080ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1081ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        yield "- " + aline
1082ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if atags:
1083ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield "? %s%s\n" % ("\t" * common, atags)
1084ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1085ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        yield "+ " + bline
1086ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if btags:
1087ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield "? %s%s\n" % ("\t" * common, btags)
1088ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1089ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# With respect to junk, an earlier version of ndiff simply refused to
1090ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# *start* a match with a junk element.  The result was cases like this:
1091ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh#     before: private Thread currentThread;
1092ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh#     after:  private volatile Thread currentThread;
1093ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# If you consider whitespace to be junk, the longest contiguous match
1094ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# not starting with junk is "e Thread currentThread".  So ndiff reported
1095ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# that "e volatil" was inserted between the 't' and the 'e' in "private".
1096ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# While an accurate view, to people that's absurd.  The current version
1097ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# looks for matching blocks that are entirely junk-free, then extends the
1098ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# longest one of those as far as possible but only with matching junk.
1099ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# So now "currentThread" is matched, then extended to suck up the
1100ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# preceding blank; then "private" is matched, and extended to suck up the
1101ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# following blank; then "Thread" is matched; and finally ndiff reports
1102ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# that "volatile " was inserted before "Thread".  The only quibble
1103ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# remaining is that perhaps it was really the case that " volatile"
1104ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# was inserted after "private".  I can live with that <wink>.
1105ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1106ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport re
1107ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1108ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
1109ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    r"""
1110ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Return 1 for ignorable line: iff `line` is blank or contains a single '#'.
1111ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1112ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Examples:
1113ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1114ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> IS_LINE_JUNK('\n')
1115ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    True
1116ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> IS_LINE_JUNK('  #   \n')
1117ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    True
1118ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> IS_LINE_JUNK('hello\n')
1119ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    False
1120ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
1121ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1122ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return pat(line) is not None
1123ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1124ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef IS_CHARACTER_JUNK(ch, ws=" \t"):
1125ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    r"""
1126ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Return 1 for ignorable character: iff `ch` is a space or tab.
1127ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1128ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Examples:
1129ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1130ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> IS_CHARACTER_JUNK(' ')
1131ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    True
1132ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> IS_CHARACTER_JUNK('\t')
1133ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    True
1134ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> IS_CHARACTER_JUNK('\n')
1135ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    False
1136ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> IS_CHARACTER_JUNK('x')
1137ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    False
1138ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
1139ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1140ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return ch in ws
1141ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1142ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1143ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh########################################################################
1144ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh###  Unified Diff
1145ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh########################################################################
1146ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1147ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _format_range_unified(start, stop):
1148ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    'Convert range to the "ed" format'
1149ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # Per the diff spec at http://www.unix.org/single_unix_specification/
1150ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    beginning = start + 1     # lines start numbering with one
1151ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    length = stop - start
1152ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if length == 1:
1153ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return '{}'.format(beginning)
1154ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if not length:
1155ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        beginning -= 1        # empty ranges begin at line just before the range
1156ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return '{},{}'.format(beginning, length)
1157ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1158ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
1159ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                 tofiledate='', n=3, lineterm='\n'):
1160ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    r"""
1161ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Compare two sequences of lines; generate the delta as a unified diff.
1162ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1163ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Unified diffs are a compact way of showing line changes and a few
1164ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    lines of context.  The number of context lines is set by 'n' which
1165ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    defaults to three.
1166ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1167ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    By default, the diff control lines (those with ---, +++, or @@) are
1168ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    created with a trailing newline.  This is helpful so that inputs
1169ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    created from file.readlines() result in diffs that are suitable for
1170ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    file.writelines() since both the inputs and outputs have trailing
1171ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    newlines.
1172ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1173ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    For inputs that do not have trailing newlines, set the lineterm
1174ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    argument to "" so that the output will be uniformly newline free.
1175ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1176ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    The unidiff format normally has a header for filenames and modification
1177ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    times.  Any or all of these may be specified using strings for
1178ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
1179ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    The modification times are normally expressed in the ISO 8601 format.
1180ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1181ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Example:
1182ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1183ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> for line in unified_diff('one two three four'.split(),
1184ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...             'zero one tree four'.split(), 'Original', 'Current',
1185ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
1186ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...             lineterm=''):
1187ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...     print line                  # doctest: +NORMALIZE_WHITESPACE
1188ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    --- Original        2005-01-26 23:30:50
1189ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    +++ Current         2010-04-02 10:20:52
1190ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    @@ -1,4 +1,4 @@
1191ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    +zero
1192ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     one
1193ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    -two
1194ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    -three
1195ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    +tree
1196ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh     four
1197ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
1198ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1199ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    started = False
1200ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
1201ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if not started:
1202ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            started = True
1203ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
1204ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            todate = '\t{}'.format(tofiledate) if tofiledate else ''
1205ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
1206ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield '+++ {}{}{}'.format(tofile, todate, lineterm)
1207ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1208ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        first, last = group[0], group[-1]
1209ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        file1_range = _format_range_unified(first[1], last[2])
1210ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        file2_range = _format_range_unified(first[3], last[4])
1211ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)
1212ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1213ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for tag, i1, i2, j1, j2 in group:
1214ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if tag == 'equal':
1215ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                for line in a[i1:i2]:
1216ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    yield ' ' + line
1217ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1218ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if tag in ('replace', 'delete'):
1219ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                for line in a[i1:i2]:
1220ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    yield '-' + line
1221ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if tag in ('replace', 'insert'):
1222ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                for line in b[j1:j2]:
1223ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    yield '+' + line
1224ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1225ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1226ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh########################################################################
1227ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh###  Context Diff
1228ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh########################################################################
1229ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1230ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _format_range_context(start, stop):
1231ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    'Convert range to the "ed" format'
1232ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # Per the diff spec at http://www.unix.org/single_unix_specification/
1233ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    beginning = start + 1     # lines start numbering with one
1234ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    length = stop - start
1235ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if not length:
1236ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        beginning -= 1        # empty ranges begin at line just before the range
1237ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if length <= 1:
1238ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return '{}'.format(beginning)
1239ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return '{},{}'.format(beginning, beginning + length - 1)
1240ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1241ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# See http://www.unix.org/single_unix_specification/
1242ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef context_diff(a, b, fromfile='', tofile='',
1243ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
1244ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    r"""
1245ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Compare two sequences of lines; generate the delta as a context diff.
1246ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1247ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Context diffs are a compact way of showing line changes and a few
1248ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    lines of context.  The number of context lines is set by 'n' which
1249ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    defaults to three.
1250ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1251ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    By default, the diff control lines (those with *** or ---) are
1252ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    created with a trailing newline.  This is helpful so that inputs
1253ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    created from file.readlines() result in diffs that are suitable for
1254ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    file.writelines() since both the inputs and outputs have trailing
1255ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    newlines.
1256ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1257ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    For inputs that do not have trailing newlines, set the lineterm
1258ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    argument to "" so that the output will be uniformly newline free.
1259ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1260ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    The context diff format normally has a header for filenames and
1261ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    modification times.  Any or all of these may be specified using
1262ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
1263ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    The modification times are normally expressed in the ISO 8601 format.
1264ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    If not specified, the strings default to blanks.
1265ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1266ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Example:
1267ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1268ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
1269ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')),
1270ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    *** Original
1271ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    --- Current
1272ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ***************
1273ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    *** 1,4 ****
1274ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      one
1275ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ! two
1276ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ! three
1277ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      four
1278ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    --- 1,4 ----
1279ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    + zero
1280ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      one
1281ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ! tree
1282ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      four
1283ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
1284ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1285ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
1286ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    started = False
1287ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
1288ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if not started:
1289ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            started = True
1290ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
1291ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            todate = '\t{}'.format(tofiledate) if tofiledate else ''
1292ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
1293ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield '--- {}{}{}'.format(tofile, todate, lineterm)
1294ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1295ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        first, last = group[0], group[-1]
1296ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        yield '***************' + lineterm
1297ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1298ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        file1_range = _format_range_context(first[1], last[2])
1299ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        yield '*** {} ****{}'.format(file1_range, lineterm)
1300ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1301ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if any(tag in ('replace', 'delete') for tag, _, _, _, _ in group):
1302ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for tag, i1, i2, _, _ in group:
1303ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if tag != 'insert':
1304ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    for line in a[i1:i2]:
1305ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        yield prefix[tag] + line
1306ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1307ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        file2_range = _format_range_context(first[3], last[4])
1308ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        yield '--- {} ----{}'.format(file2_range, lineterm)
1309ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1310ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if any(tag in ('replace', 'insert') for tag, _, _, _, _ in group):
1311ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for tag, _, _, j1, j2 in group:
1312ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if tag != 'delete':
1313ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    for line in b[j1:j2]:
1314ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        yield prefix[tag] + line
1315ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1316ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
1317ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    r"""
1318ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
1319ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1320ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Optional keyword parameters `linejunk` and `charjunk` are for filter
1321ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    functions (or None):
1322ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1323ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    - linejunk: A function that should accept a single string argument, and
1324ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      return true iff the string is junk.  The default is None, and is
1325ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      recommended; as of Python 2.3, an adaptive notion of "noise" lines is
1326ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      used that does a good job on its own.
1327ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1328ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    - charjunk: A function that should accept a string of length 1. The
1329ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      default is module-level function IS_CHARACTER_JUNK, which filters out
1330ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      whitespace characters (a blank or tab; note: bad idea to include newline
1331ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh      in this!).
1332ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1333ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Tools/scripts/ndiff.py is a command-line front-end to this function.
1334ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1335ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Example:
1336ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1337ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
1338ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...              'ore\ntree\nemu\n'.splitlines(1))
1339ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> print ''.join(diff),
1340ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    - one
1341ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ?  ^
1342ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    + ore
1343ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ?  ^
1344ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    - two
1345ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    - three
1346ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ?  -
1347ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    + tree
1348ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    + emu
1349ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
1350ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return Differ(linejunk, charjunk).compare(a, b)
1351ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1352ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _mdiff(fromlines, tolines, context=None, linejunk=None,
1353ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh           charjunk=IS_CHARACTER_JUNK):
1354ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    r"""Returns generator yielding marked up from/to side by side differences.
1355ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1356ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Arguments:
1357ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    fromlines -- list of text lines to compared to tolines
1358ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    tolines -- list of text lines to be compared to fromlines
1359ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    context -- number of context lines to display on each side of difference,
1360ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh               if None, all from/to text lines will be generated.
1361ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    linejunk -- passed on to ndiff (see ndiff documentation)
1362ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    charjunk -- passed on to ndiff (see ndiff documentation)
1363ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1364ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    This function returns an interator which returns a tuple:
1365ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    (from line tuple, to line tuple, boolean flag)
1366ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1367ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    from/to line tuple -- (line num, line text)
1368ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        line num -- integer or None (to indicate a context separation)
1369ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        line text -- original line text with following markers inserted:
1370ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            '\0+' -- marks start of added text
1371ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            '\0-' -- marks start of deleted text
1372ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            '\0^' -- marks start of changed text
1373ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            '\1' -- marks end of added/deleted/changed text
1374ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1375ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    boolean flag -- None indicates context separation, True indicates
1376ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        either "from" or "to" line contains a change, otherwise False.
1377ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1378ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    This function/iterator was originally developed to generate side by side
1379ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    file difference for making HTML pages (see HtmlDiff class for example
1380ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    usage).
1381ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1382ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Note, this function utilizes the ndiff function to generate the side by
1383ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    side difference markup.  Optional ndiff arguments may be passed to this
1384ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    function and they in turn will be passed to ndiff.
1385ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
1386ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    import re
1387ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1388ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # regular expression for finding intraline change indices
1389ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    change_re = re.compile('(\++|\-+|\^+)')
1390ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1391ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # create the difference iterator to generate the differences
1392ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
1393ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1394ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _make_line(lines, format_key, side, num_lines=[0,0]):
1395ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Returns line of text with user's change markup and line formatting.
1396ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1397ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        lines -- list of lines from the ndiff generator to produce a line of
1398ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                 text from.  When producing the line of text to return, the
1399ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                 lines used are removed from this list.
1400ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        format_key -- '+' return first line in list with "add" markup around
1401ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                          the entire line.
1402ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      '-' return first line in list with "delete" markup around
1403ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                          the entire line.
1404ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      '?' return first line in list with add/delete/change
1405ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                          intraline markup (indices obtained from second line)
1406ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      None return first line in list with no markup
1407ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        side -- indice into the num_lines list (0=from,1=to)
1408ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        num_lines -- from/to current line number.  This is NOT intended to be a
1409ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                     passed parameter.  It is present as a keyword argument to
1410ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                     maintain memory of the current line numbers between calls
1411ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                     of this function.
1412ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1413ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Note, this function is purposefully not defined at the module scope so
1414ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        that data it needs from its parent function (within whose context it
1415ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        is defined) does not need to be of module scope.
1416ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1417ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        num_lines[side] += 1
1418ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Handle case where no user markup is to be added, just return line of
1419ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # text with user's line format to allow for usage of the line number.
1420ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if format_key is None:
1421ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return (num_lines[side],lines.pop(0)[2:])
1422ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Handle case of intraline changes
1423ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if format_key == '?':
1424ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            text, markers = lines.pop(0), lines.pop(0)
1425ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # find intraline changes (store change type and indices in tuples)
1426ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            sub_info = []
1427ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            def record_sub_info(match_object,sub_info=sub_info):
1428ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                sub_info.append([match_object.group(1)[0],match_object.span()])
1429ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                return match_object.group(1)
1430ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            change_re.sub(record_sub_info,markers)
1431ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # process each tuple inserting our special marks that won't be
1432ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # noticed by an xml/html escaper.
1433ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for key,(begin,end) in sub_info[::-1]:
1434ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
1435ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            text = text[2:]
1436ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Handle case of add/delete entire line
1437ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        else:
1438ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            text = lines.pop(0)[2:]
1439ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # if line of text is just a newline, insert a space so there is
1440ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # something for the user to highlight and see.
1441ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if not text:
1442ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                text = ' '
1443ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # insert marks that won't be noticed by an xml/html escaper.
1444ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            text = '\0' + format_key + text + '\1'
1445ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Return line of text, first allow user's line formatter to do its
1446ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # thing (such as adding the line number) then replace the special
1447ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # marks with what the user's change markup.
1448ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return (num_lines[side],text)
1449ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1450ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _line_iterator():
1451ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Yields from/to lines of text with a change indication.
1452ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1453ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        This function is an iterator.  It itself pulls lines from a
1454ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        differencing iterator, processes them and yields them.  When it can
1455ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        it yields both a "from" and a "to" line, otherwise it will yield one
1456ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        or the other.  In addition to yielding the lines of from/to text, a
1457ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        boolean flag is yielded to indicate if the text line(s) have
1458ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        differences in them.
1459ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1460ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Note, this function is purposefully not defined at the module scope so
1461ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        that data it needs from its parent function (within whose context it
1462ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        is defined) does not need to be of module scope.
1463ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1464ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        lines = []
1465ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        num_blanks_pending, num_blanks_to_yield = 0, 0
1466ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while True:
1467ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # Load up next 4 lines so we can look ahead, create strings which
1468ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # are a concatenation of the first character of each of the 4 lines
1469ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # so we can do some very readable comparisons.
1470ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            while len(lines) < 4:
1471ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                try:
1472ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    lines.append(diff_lines_iterator.next())
1473ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                except StopIteration:
1474ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    lines.append('X')
1475ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            s = ''.join([line[0] for line in lines])
1476ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if s.startswith('X'):
1477ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # When no more lines, pump out any remaining blank lines so the
1478ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # corresponding add/delete lines get a matching blank line so
1479ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # all line pairs get yielded at the next level.
1480ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                num_blanks_to_yield = num_blanks_pending
1481ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith('-?+?'):
1482ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # simple intraline change
1483ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
1484ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1485ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith('--++'):
1486ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # in delete block, add block coming: we do NOT want to get
1487ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # caught up on blank lines yet, just process the delete line
1488ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                num_blanks_pending -= 1
1489ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield _make_line(lines,'-',0), None, True
1490ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1491ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith(('--?+', '--+', '- ')):
1492ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # in delete block and see a intraline change or unchanged line
1493ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # coming: yield the delete line and then blanks
1494ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                from_line,to_line = _make_line(lines,'-',0), None
1495ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
1496ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith('-+?'):
1497ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # intraline change
1498ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
1499ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1500ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith('-?+'):
1501ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # intraline change
1502ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
1503ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1504ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith('-'):
1505ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # delete FROM line
1506ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                num_blanks_pending -= 1
1507ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield _make_line(lines,'-',0), None, True
1508ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1509ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith('+--'):
1510ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # in add block, delete block coming: we do NOT want to get
1511ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # caught up on blank lines yet, just process the add line
1512ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                num_blanks_pending += 1
1513ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield None, _make_line(lines,'+',1), True
1514ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1515ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith(('+ ', '+-')):
1516ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # will be leaving an add block: yield blanks then add line
1517ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                from_line, to_line = None, _make_line(lines,'+',1)
1518ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
1519ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith('+'):
1520ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # inside an add block, yield the add line
1521ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                num_blanks_pending += 1
1522ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield None, _make_line(lines,'+',1), True
1523ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1524ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif s.startswith(' '):
1525ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # unchanged text, yield it to both sides
1526ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
1527ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1528ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # Catch up on the blank lines so when we yield the next from/to
1529ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # pair, they are lined up.
1530ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            while(num_blanks_to_yield < 0):
1531ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                num_blanks_to_yield += 1
1532ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield None,('','\n'),True
1533ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            while(num_blanks_to_yield > 0):
1534ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                num_blanks_to_yield -= 1
1535ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield ('','\n'),None,True
1536ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if s.startswith('X'):
1537ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                raise StopIteration
1538ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
1539ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield from_line,to_line,True
1540ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1541ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _line_pair_iterator():
1542ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Yields from/to lines of text with a change indication.
1543ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1544ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        This function is an iterator.  It itself pulls lines from the line
1545ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        iterator.  Its difference from that iterator is that this function
1546ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        always yields a pair of from/to text lines (with the change
1547ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        indication).  If necessary it will collect single from/to lines
1548ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        until it has a matching pair from/to pair to yield.
1549ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1550ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Note, this function is purposefully not defined at the module scope so
1551ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        that data it needs from its parent function (within whose context it
1552ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        is defined) does not need to be of module scope.
1553ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1554ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        line_iterator = _line_iterator()
1555ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromlines,tolines=[],[]
1556ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while True:
1557ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # Collecting lines of text until we have a from/to pair
1558ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            while (len(fromlines)==0 or len(tolines)==0):
1559ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                from_line, to_line, found_diff =line_iterator.next()
1560ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if from_line is not None:
1561ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    fromlines.append((from_line,found_diff))
1562ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if to_line is not None:
1563ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    tolines.append((to_line,found_diff))
1564ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # Once we have a pair, remove them from the collection and yield it
1565ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            from_line, fromDiff = fromlines.pop(0)
1566ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            to_line, to_diff = tolines.pop(0)
1567ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield (from_line,to_line,fromDiff or to_diff)
1568ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1569ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # Handle case where user does not want context differencing, just yield
1570ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # them up without doing anything else with them.
1571ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    line_pair_iterator = _line_pair_iterator()
1572ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if context is None:
1573ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while True:
1574ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield line_pair_iterator.next()
1575ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # Handle case where user wants context differencing.  We must do some
1576ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # storage of lines until we know for sure that they are to be yielded.
1577ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    else:
1578ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        context += 1
1579ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        lines_to_write = 0
1580ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while True:
1581ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # Store lines up until we find a difference, note use of a
1582ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # circular queue because we only need to keep around what
1583ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # we need for context.
1584ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            index, contextLines = 0, [None]*(context)
1585ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            found_diff = False
1586ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            while(found_diff is False):
1587ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                from_line, to_line, found_diff = line_pair_iterator.next()
1588ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                i = index % context
1589ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                contextLines[i] = (from_line, to_line, found_diff)
1590ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                index += 1
1591ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # Yield lines that we have collected so far, but first yield
1592ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # the user's separator.
1593ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if index > context:
1594ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield None, None, None
1595ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                lines_to_write = context
1596ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
1597ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                lines_to_write = index
1598ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                index = 0
1599ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            while(lines_to_write):
1600ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                i = index % context
1601ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                index += 1
1602ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield contextLines[i]
1603ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                lines_to_write -= 1
1604ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # Now yield the context lines after the change
1605ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            lines_to_write = context-1
1606ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            while(lines_to_write):
1607ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                from_line, to_line, found_diff = line_pair_iterator.next()
1608ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # If another change within the context, extend the context
1609ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if found_diff:
1610ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    lines_to_write = context-1
1611ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                else:
1612ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    lines_to_write -= 1
1613ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield from_line, to_line, found_diff
1614ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1615ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1616ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_file_template = """
1617ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
1618ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
1619ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1620ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh<html>
1621ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1622ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh<head>
1623ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    <meta http-equiv="Content-Type"
1624ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh          content="text/html; charset=ISO-8859-1" />
1625ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    <title></title>
1626ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    <style type="text/css">%(styles)s
1627ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    </style>
1628ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh</head>
1629ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1630ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh<body>
1631ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    %(table)s%(legend)s
1632ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh</body>
1633ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1634ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh</html>"""
1635ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1636ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_styles = """
1637ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        table.diff {font-family:Courier; border:medium;}
1638ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        .diff_header {background-color:#e0e0e0}
1639ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        td.diff_header {text-align:right}
1640ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        .diff_next {background-color:#c0c0c0}
1641ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        .diff_add {background-color:#aaffaa}
1642ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        .diff_chg {background-color:#ffff77}
1643ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        .diff_sub {background-color:#ffaaaa}"""
1644ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1645ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_table_template = """
1646ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    <table class="diff" id="difflib_chg_%(prefix)s_top"
1647ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh           cellspacing="0" cellpadding="0" rules="groups" >
1648ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
1649ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
1650ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        %(header_row)s
1651ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        <tbody>
1652ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh%(data_rows)s        </tbody>
1653ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    </table>"""
1654ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1655ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_legend = """
1656ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    <table class="diff" summary="Legends">
1657ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        <tr> <th colspan="2"> Legends </th> </tr>
1658ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        <tr> <td> <table border="" summary="Colors">
1659ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      <tr><th> Colors </th> </tr>
1660ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
1661ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      <tr><td class="diff_chg">Changed</td> </tr>
1662ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      <tr><td class="diff_sub">Deleted</td> </tr>
1663ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                  </table></td>
1664ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh             <td> <table border="" summary="Links">
1665ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      <tr><th colspan="2"> Links </th> </tr>
1666ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      <tr><td>(f)irst change</td> </tr>
1667ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      <tr><td>(n)ext change</td> </tr>
1668ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      <tr><td>(t)op</td> </tr>
1669ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                  </table></td> </tr>
1670ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    </table>"""
1671ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1672ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehclass HtmlDiff(object):
1673ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """For producing HTML side by side comparison with change highlights.
1674ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1675ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    This class can be used to create an HTML table (or a complete HTML file
1676ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    containing the table) showing a side by side, line by line comparison
1677ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    of text with inter-line and intra-line change highlights.  The table can
1678ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    be generated in either full or contextual difference mode.
1679ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1680ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    The following methods are provided for HTML generation:
1681ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1682ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    make_table -- generates HTML for a single side by side table
1683ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    make_file -- generates complete HTML file with a single side by side table
1684ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1685ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    See tools/scripts/diff.py for an example usage of this class.
1686ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
1687ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1688ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _file_template = _file_template
1689ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _styles = _styles
1690ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _table_template = _table_template
1691ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _legend = _legend
1692ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _default_prefix = 0
1693ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1694ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
1695ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                 charjunk=IS_CHARACTER_JUNK):
1696ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """HtmlDiff instance initializer
1697ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1698ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Arguments:
1699ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        tabsize -- tab stop spacing, defaults to 8.
1700ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        wrapcolumn -- column number where lines are broken and wrapped,
1701ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            defaults to None where lines are not wrapped.
1702ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        linejunk,charjunk -- keyword arguments passed into ndiff() (used to by
1703ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            HtmlDiff() to generate the side by side HTML differences).  See
1704ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            ndiff() documentation for argument default values and descriptions.
1705ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1706ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self._tabsize = tabsize
1707ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self._wrapcolumn = wrapcolumn
1708ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self._linejunk = linejunk
1709ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self._charjunk = charjunk
1710ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1711ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
1712ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                  numlines=5):
1713ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Returns HTML file of side by side comparison with change highlights
1714ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1715ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Arguments:
1716ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromlines -- list of "from" lines
1717ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        tolines -- list of "to" lines
1718ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromdesc -- "from" file column header string
1719ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        todesc -- "to" file column header string
1720ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        context -- set to True for contextual differences (defaults to False
1721ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            which shows full differences).
1722ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        numlines -- number of context lines.  When context is set True,
1723ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            controls number of lines displayed before and after the change.
1724ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            When context is False, controls the number of lines to place
1725ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            the "next" link anchors before the next change (so click of
1726ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            "next" link jumps to just before the change).
1727ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1728ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1729ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return self._file_template % dict(
1730ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            styles = self._styles,
1731ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            legend = self._legend,
1732ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            table = self.make_table(fromlines,tolines,fromdesc,todesc,
1733ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                                    context=context,numlines=numlines))
1734ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1735ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _tab_newline_replace(self,fromlines,tolines):
1736ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Returns from/to line lists with tabs expanded and newlines removed.
1737ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1738ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Instead of tab characters being replaced by the number of spaces
1739ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        needed to fill in to the next tab stop, this function will fill
1740ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        the space with tab characters.  This is done so that the difference
1741ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        algorithms can identify changes in a file when tabs are replaced by
1742ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        spaces and vice versa.  At the end of the HTML generation, the tab
1743ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        characters will be replaced with a nonbreakable space.
1744ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1745ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        def expand_tabs(line):
1746ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # hide real spaces
1747ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            line = line.replace(' ','\0')
1748ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # expand tabs into spaces
1749ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            line = line.expandtabs(self._tabsize)
1750ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # replace spaces from expanded tabs back into tab characters
1751ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # (we'll replace them with markup after we do differencing)
1752ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            line = line.replace(' ','\t')
1753ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return line.replace('\0',' ').rstrip('\n')
1754ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromlines = [expand_tabs(line) for line in fromlines]
1755ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        tolines = [expand_tabs(line) for line in tolines]
1756ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return fromlines,tolines
1757ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1758ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _split_line(self,data_list,line_num,text):
1759ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Builds list of text lines by splitting text lines at wrap point
1760ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1761ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        This function will determine if the input text line needs to be
1762ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        wrapped (split) into separate lines.  If so, the first wrap point
1763ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        will be determined and the first line appended to the output
1764ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        text line list.  This function is used recursively to handle
1765ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        the second part of the split line to further split it.
1766ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1767ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # if blank line or context separator, just add it to the output list
1768ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if not line_num:
1769ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            data_list.append((line_num,text))
1770ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return
1771ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1772ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # if line text doesn't need wrapping, just add it to the output list
1773ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        size = len(text)
1774ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        max = self._wrapcolumn
1775ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if (size <= max) or ((size -(text.count('\0')*3)) <= max):
1776ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            data_list.append((line_num,text))
1777ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return
1778ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1779ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # scan text looking for the wrap point, keeping track if the wrap
1780ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # point is inside markers
1781ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        i = 0
1782ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        n = 0
1783ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        mark = ''
1784ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while n < max and i < size:
1785ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if text[i] == '\0':
1786ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                i += 1
1787ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                mark = text[i]
1788ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                i += 1
1789ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif text[i] == '\1':
1790ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                i += 1
1791ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                mark = ''
1792ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
1793ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                i += 1
1794ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                n += 1
1795ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1796ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # wrap point is inside text, break it up into separate lines
1797ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        line1 = text[:i]
1798ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        line2 = text[i:]
1799ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1800ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # if wrap point is inside markers, place end marker at end of first
1801ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # line and start marker at beginning of second line because each
1802ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # line will have its own table tag markup around it.
1803ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if mark:
1804ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            line1 = line1 + '\1'
1805ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            line2 = '\0' + mark + line2
1806ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1807ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # tack on first line onto the output list
1808ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        data_list.append((line_num,line1))
1809ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1810ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # use this routine again to wrap the remaining text
1811ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self._split_line(data_list,'>',line2)
1812ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1813ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _line_wrapper(self,diffs):
1814ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Returns iterator that splits (wraps) mdiff text lines"""
1815ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1816ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # pull from/to data and flags from mdiff iterator
1817ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for fromdata,todata,flag in diffs:
1818ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # check for context separators and pass them through
1819ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flag is None:
1820ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield fromdata,todata,flag
1821ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                continue
1822ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            (fromline,fromtext),(toline,totext) = fromdata,todata
1823ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # for each from/to line split it at the wrap column to form
1824ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # list of text lines.
1825ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            fromlist,tolist = [],[]
1826ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            self._split_line(fromlist,fromline,fromtext)
1827ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            self._split_line(tolist,toline,totext)
1828ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # yield from/to line in pairs inserting blank lines as
1829ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # necessary when one side has more wrapped lines
1830ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            while fromlist or tolist:
1831ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if fromlist:
1832ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    fromdata = fromlist.pop(0)
1833ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                else:
1834ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    fromdata = ('',' ')
1835ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if tolist:
1836ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    todata = tolist.pop(0)
1837ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                else:
1838ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    todata = ('',' ')
1839ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                yield fromdata,todata,flag
1840ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1841ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _collect_lines(self,diffs):
1842ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Collects mdiff output into separate lists
1843ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1844ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Before storing the mdiff from/to data into a list, it is converted
1845ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        into a single line of text with HTML markup.
1846ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1847ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1848ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromlist,tolist,flaglist = [],[],[]
1849ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # pull from/to data and flags from mdiff style iterator
1850ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for fromdata,todata,flag in diffs:
1851ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            try:
1852ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # store HTML markup of the lines into the lists
1853ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                fromlist.append(self._format_line(0,flag,*fromdata))
1854ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                tolist.append(self._format_line(1,flag,*todata))
1855ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            except TypeError:
1856ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # exceptions occur for lines where context separators go
1857ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                fromlist.append(None)
1858ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                tolist.append(None)
1859ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            flaglist.append(flag)
1860ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return fromlist,tolist,flaglist
1861ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1862ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _format_line(self,side,flag,linenum,text):
1863ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Returns HTML markup of "from" / "to" text lines
1864ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1865ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        side -- 0 or 1 indicating "from" or "to" text
1866ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        flag -- indicates if difference on line
1867ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        linenum -- line number (used for line number column)
1868ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        text -- line text to be marked up
1869ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1870ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        try:
1871ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            linenum = '%d' % linenum
1872ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            id = ' id="%s%s"' % (self._prefix[side],linenum)
1873ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        except TypeError:
1874ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # handle blank lines where linenum is '>' or ''
1875ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            id = ''
1876ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # replace those things that would get confused with HTML symbols
1877ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
1878ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1879ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # make space non-breakable so they don't get compressed or line wrapped
1880ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        text = text.replace(' ','&nbsp;').rstrip()
1881ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1882ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
1883ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh               % (id,linenum,text)
1884ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1885ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _make_prefix(self):
1886ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Create unique anchor prefixes"""
1887ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1888ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # Generate a unique anchor prefix so multiple tables
1889ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # can exist on the same HTML page without conflicts.
1890ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromprefix = "from%d_" % HtmlDiff._default_prefix
1891ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        toprefix = "to%d_" % HtmlDiff._default_prefix
1892ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        HtmlDiff._default_prefix += 1
1893ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # store prefixes so line format method has access
1894ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self._prefix = [fromprefix,toprefix]
1895ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1896ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
1897ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Makes list of "next" links"""
1898ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1899ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # all anchor names will be generated using the unique "to" prefix
1900ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        toprefix = self._prefix[1]
1901ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1902ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # process change flags, generating middle column of next anchors/links
1903ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        next_id = ['']*len(flaglist)
1904ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        next_href = ['']*len(flaglist)
1905ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        num_chg, in_change = 0, False
1906ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        last = 0
1907ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for i,flag in enumerate(flaglist):
1908ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flag:
1909ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if not in_change:
1910ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    in_change = True
1911ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    last = i
1912ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    # at the beginning of a change, drop an anchor a few lines
1913ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    # (the context lines) before the change for the previous
1914ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    # link
1915ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    i = max([0,i-numlines])
1916ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
1917ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    # at the beginning of a change, drop a link to the next
1918ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    # change
1919ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    num_chg += 1
1920ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
1921ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                         toprefix,num_chg)
1922ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
1923ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                in_change = False
1924ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # check for cases where there is no content to avoid exceptions
1925ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if not flaglist:
1926ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            flaglist = [False]
1927ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            next_id = ['']
1928ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            next_href = ['']
1929ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            last = 0
1930ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if context:
1931ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
1932ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                tolist = fromlist
1933ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
1934ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
1935ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # if not a change on first line, drop a link
1936ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if not flaglist[0]:
1937ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
1938ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # redo the last link to link to the top
1939ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
1940ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1941ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return fromlist,tolist,flaglist,next_href,next_id
1942ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1943ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
1944ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                   numlines=5):
1945ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """Returns HTML table of side by side comparison with change highlights
1946ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1947ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        Arguments:
1948ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromlines -- list of "from" lines
1949ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        tolines -- list of "to" lines
1950ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromdesc -- "from" file column header string
1951ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        todesc -- "to" file column header string
1952ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        context -- set to True for contextual differences (defaults to False
1953ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            which shows full differences).
1954ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        numlines -- number of context lines.  When context is set True,
1955ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            controls number of lines displayed before and after the change.
1956ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            When context is False, controls the number of lines to place
1957ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            the "next" link anchors before the next change (so click of
1958ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            "next" link jumps to just before the change).
1959ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
1960ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1961ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # make unique anchor prefixes so that multiple tables may exist
1962ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # on the same page without conflict.
1963ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self._make_prefix()
1964ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1965ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # change tabs to spaces before it gets more difficult after we insert
1966ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # markkup
1967ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
1968ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1969ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # create diffs iterator which generates side by side from/to data
1970ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if context:
1971ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            context_lines = numlines
1972ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        else:
1973ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            context_lines = None
1974ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
1975ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                      charjunk=self._charjunk)
1976ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1977ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # set up iterator to wrap lines that exceed desired width
1978ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if self._wrapcolumn:
1979ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            diffs = self._line_wrapper(diffs)
1980ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1981ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # collect up from/to lines and flags into lists (also format the lines)
1982ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromlist,tolist,flaglist = self._collect_lines(diffs)
1983ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1984ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # process change flags, generating middle column of next anchors/links
1985ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
1986ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            fromlist,tolist,flaglist,context,numlines)
1987ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
1988ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        s = []
1989ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
1990ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh              '<td class="diff_next">%s</td>%s</tr>\n'
1991ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for i in range(len(flaglist)):
1992ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flaglist[i] is None:
1993ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # mdiff yields None on separator lines skip the bogus ones
1994ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # generated for the first line
1995ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if i > 0:
1996ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    s.append('        </tbody>        \n        <tbody>\n')
1997ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
1998ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                s.append( fmt % (next_id[i],next_href[i],fromlist[i],
1999ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                                           next_href[i],tolist[i]))
2000ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if fromdesc or todesc:
2001ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
2002ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                '<th class="diff_next"><br /></th>',
2003ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
2004ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                '<th class="diff_next"><br /></th>',
2005ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                '<th colspan="2" class="diff_header">%s</th>' % todesc)
2006ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        else:
2007ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            header_row = ''
2008ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
2009ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        table = self._table_template % dict(
2010ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            data_rows=''.join(s),
2011ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            header_row=header_row,
2012ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            prefix=self._prefix[1])
2013ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
2014ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return table.replace('\0+','<span class="diff_add">'). \
2015ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                     replace('\0-','<span class="diff_sub">'). \
2016ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                     replace('\0^','<span class="diff_chg">'). \
2017ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                     replace('\1','</span>'). \
2018ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                     replace('\t','&nbsp;')
2019ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
2020ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdel re
2021ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
2022ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef restore(delta, which):
2023ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    r"""
2024ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Generate one of the two sequences that generated a delta.
2025ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
2026ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
2027ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    lines originating from file 1 or 2 (parameter `which`), stripping off line
2028ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    prefixes.
2029ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
2030ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    Examples:
2031ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
2032ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
2033ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ...              'ore\ntree\nemu\n'.splitlines(1))
2034ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> diff = list(diff)
2035ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> print ''.join(restore(diff, 1)),
2036ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    one
2037ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    two
2038ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    three
2039ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    >>> print ''.join(restore(diff, 2)),
2040ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ore
2041ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    tree
2042ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    emu
2043ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    """
2044ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    try:
2045ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        tag = {1: "- ", 2: "+ "}[int(which)]
2046ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    except KeyError:
2047ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
2048ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                           % which)
2049ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    prefixes = ("  ", tag)
2050ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for line in delta:
2051ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if line[:2] in prefixes:
2052ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            yield line[2:]
2053ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
2054ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _test():
2055ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    import doctest, difflib
2056ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return doctest.testmod(difflib)
2057ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
2058ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehif __name__ == "__main__":
2059ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _test()
2060