1"""
2Module difflib -- helpers for computing deltas between objects.
3
4Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
5    Use SequenceMatcher to return list of the best "good enough" matches.
6
7Function context_diff(a, b):
8    For two lists of strings, return a delta in context diff format.
9
10Function ndiff(a, b):
11    Return a delta: the difference between `a` and `b` (lists of strings).
12
13Function restore(delta, which):
14    Return one of the two sequences that generated an ndiff delta.
15
16Function unified_diff(a, b):
17    For two lists of strings, return a delta in unified diff format.
18
19Class SequenceMatcher:
20    A flexible class for comparing pairs of sequences of any type.
21
22Class Differ:
23    For producing human-readable deltas from sequences of lines of text.
24
25Class HtmlDiff:
26    For producing HTML side by side comparison with change highlights.
27"""
28
29__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
30           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
31           'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
32
33from heapq import nlargest as _nlargest
34from collections import namedtuple as _namedtuple
35
36Match = _namedtuple('Match', 'a b size')
37
38def _calculate_ratio(matches, length):
39    if length:
40        return 2.0 * matches / length
41    return 1.0
42
43class SequenceMatcher:
44
45    """
46    SequenceMatcher is a flexible class for comparing pairs of sequences of
47    any type, so long as the sequence elements are hashable.  The basic
48    algorithm predates, and is a little fancier than, an algorithm
49    published in the late 1980's by Ratcliff and Obershelp under the
50    hyperbolic name "gestalt pattern matching".  The basic idea is to find
51    the longest contiguous matching subsequence that contains no "junk"
52    elements (R-O doesn't address junk).  The same idea is then applied
53    recursively to the pieces of the sequences to the left and to the right
54    of the matching subsequence.  This does not yield minimal edit
55    sequences, but does tend to yield matches that "look right" to people.
56
57    SequenceMatcher tries to compute a "human-friendly diff" between two
58    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
59    longest *contiguous* & junk-free matching subsequence.  That's what
    catches people's eyes.  The Windows(tm) windiff has another interesting
    notion, pairing up elements that appear uniquely in each sequence.
    That, and the method here, appear to yield more intuitive difference
    reports than does diff.  This method appears to be the least vulnerable
    to synching up on blocks of "junk lines", though (like blank lines in
    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
    because this is the only method of the 3 that has a *concept* of
    "junk" <wink>.

    Example, comparing two strings, and considering blanks to be "junk":

    >>> s = SequenceMatcher(lambda x: x == " ",
    ...                     "private Thread currentThread;",
    ...                     "private volatile Thread currentThread;")
    >>>

    .ratio() returns a float in [0, 1], measuring the "similarity" of the
    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
    sequences are close matches:

    >>> print(round(s.ratio(), 3))
    0.866
    >>>

    If you're only interested in where the sequences match,
    .get_matching_blocks() is handy:

    >>> for block in s.get_matching_blocks():
    ...     print("a[%d] and b[%d] match for %d elements" % block)
    a[0] and b[0] match for 8 elements
    a[8] and b[17] match for 21 elements
    a[29] and b[38] match for 0 elements

    Note that the last tuple returned by .get_matching_blocks() is always a
    dummy, (len(a), len(b), 0), and this is the only case in which the last
    tuple element (number of elements matched) is 0.

    If you want to know how to change the first sequence into the second,
    use .get_opcodes():

    >>> for opcode in s.get_opcodes():
    ...     print("%6s a[%d:%d] b[%d:%d]" % opcode)
     equal a[0:8] b[0:8]
    insert a[8:8] b[8:17]
     equal a[8:29] b[17:38]

    See the Differ class for a fancy human-friendly file differencer, which
    uses SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    See also function get_close_matches() in this module, which shows how
    simple code building on SequenceMatcher can be used to do useful work.

    Timing:  Basic R-O is cubic time worst case and quadratic time expected
    case.  SequenceMatcher is quadratic time for the worst case and has
    expected-case behavior dependent in a complicated way on how many
    elements the sequences have in common; best case time is linear.

    Methods:

    __init__(isjunk=None, a='', b='')
        Construct a SequenceMatcher.

    set_seqs(a, b)
        Set the two sequences to be compared.

    set_seq1(a)
        Set the first sequence to be compared.

    set_seq2(b)
        Set the second sequence to be compared.

    find_longest_match(alo, ahi, blo, bhi)
        Find longest matching block in a[alo:ahi] and b[blo:bhi].

    get_matching_blocks()
        Return list of triples describing matching subsequences.

    get_opcodes()
        Return list of 5-tuples describing how to turn a into b.

    ratio()
        Return a measure of the sequences' similarity (float in [0,1]).

    quick_ratio()
        Return an upper bound on .ratio() relatively quickly.

    real_quick_ratio()
        Return an upper bound on ratio() very quickly.
    """

    def __init__(self, isjunk=None, a='', b='', autojunk=True):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
        no elements are considered to be junk.  For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional arg a is the first of two sequences to be compared.  By
        default, an empty string.  The elements of a must be hashable.  See
        also .set_seqs() and .set_seq1().

        Optional arg b is the second of two sequences to be compared.  By
        default, an empty string.  The elements of b must be hashable. See
        also .set_seqs() and .set_seq2().

        Optional arg autojunk should be set to False to disable the
        "automatic junk heuristic" that treats popular elements as junk
        (see module documentation for more information).
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk and popular elements do not appear
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use "in self.bjunk".
        # bjunk
        #      the items in b for which isjunk is True.
        # bpopular
        #      nonjunk items in b treated as junk by the heuristic (if used).

        self.isjunk = isjunk
        self.a = self.b = None
        self.autojunk = autojunk
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # b2j also does not contain entries for "popular" elements, meaning
    # elements that account for more than 1 + 1% of the total elements, and
    # when the sequence is reasonably large (>= 200 elements); this can
    # be viewed as an adaptive notion of semi-junk, and yields an enormous
    # speedup when, e.g., comparing program files with hundreds of
    # instances of "return NULL;" ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        self.b2j = b2j = {}

        for i, elt in enumerate(b):
            indices = b2j.setdefault(elt, [])
            indices.append(i)

        # Purge junk elements
        self.bjunk = junk = set()
        isjunk = self.isjunk
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junk.add(elt)
            for elt in junk: # separate loop avoids separate list of keys
                del b2j[elt]

        # Purge popular elements that are not junk
        self.bpopular = popular = set()
        n = len(b)
        if self.autojunk and n >= 200:
            ntest = n // 100 + 1
            for elt, idxs in b2j.items():
                if len(idxs) > ntest:
                    popular.add(elt)
            for elt in popular: # ditto; as fast for 1% deletion
                del b2j[elt]
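
    # Illustrative sketch (not part of the original source): with the
    # heuristic enabled, any element occupying more than roughly 1% of a
    # sequence of 200+ elements is dropped from b2j as "popular", so
    # find_longest_match() can no longer anchor a match on it:
    #
    #   >>> b = ['x'] * 199 + ['y']        # 'x' is "popular"
    #   >>> SequenceMatcher(None, ['x', 'y'], b).bpopular
    #   {'x'}
    #   >>> SequenceMatcher(None, ['x', 'y'], b, autojunk=False).bpopular
    #   set()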

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=0, b=4, size=5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=1, b=0, size=4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        Match(a=0, b=0, size=0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []
        for i in range(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return Match(besti, bestj, bestsize)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.  New in Python 2.5, it's also guaranteed that if
        (i, j, n) and (i', j', n') are adjacent triples in the list, and
        the second is not the last triple in the list, then i+n != i' or
        j+n != j'.  IOW, adjacent triples never describe adjacent equal
        blocks.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> list(s.get_matching_blocks())
        [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        la, lb = len(self.a), len(self.b)

        # This is most naturally expressed as a recursive algorithm, but
        # at least one user bumped into extreme use cases that exceeded
        # the recursion limit on their box.  So, now we maintain a list
        # (`queue`) of blocks we still need to look at, and append partial
        # results to `matching_blocks` in a loop; the matches are sorted
        # at the end.
        queue = [(0, la, 0, lb)]
        matching_blocks = []
        while queue:
            alo, ahi, blo, bhi = queue.pop()
            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
            # a[alo:i] vs b[blo:j] unknown
            # a[i:i+k] same as b[j:j+k]
            # a[i+k:ahi] vs b[j+k:bhi] unknown
            if k:   # if k is 0, there was no matching block
                matching_blocks.append(x)
                if alo < i and blo < j:
                    queue.append((alo, i, blo, j))
                if i+k < ahi and j+k < bhi:
                    queue.append((i+k, ahi, j+k, bhi))
        matching_blocks.sort()

        # It's possible that we have adjacent equal blocks in the
        # matching_blocks list now.  Starting with 2.5, this code was added
        # to collapse them.
        i1 = j1 = k1 = 0
        non_adjacent = []
        for i2, j2, k2 in matching_blocks:
            # Is this block adjacent to i1, j1, k1?
            if i1 + k1 == i2 and j1 + k1 == j2:
                # Yes, so collapse them -- this just increases the length of
                # the first block by the length of the second, and the first
                # block so lengthened remains the block to compare against.
                k1 += k2
            else:
                # Not adjacent.  Remember the first block (k1==0 means it's
                # the dummy we started with), and make the second block the
                # new block to compare against.
                if k1:
                    non_adjacent.append((i1, j1, k1))
                i1, j1, k1 = i2, j2, k2
        if k1:
            non_adjacent.append((i1, j1, k1))

        non_adjacent.append( (la, lb, 0) )
        self.matching_blocks = list(map(Match._make, non_adjacent))
        return self.matching_blocks

    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant:  we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer

    def get_grouped_opcodes(self, n=3):
        """ Isolate change clusters by eliminating ranges with no changes.

        Return a generator of groups with up to n lines of context.
        Each group is in the same format as returned by get_opcodes().

        >>> from pprint import pprint
        >>> a = list(map(str, range(1,40)))
        >>> b = a[:]
        >>> b[8:8] = ['i']     # Make an insertion
        >>> b[20] += 'x'       # Make a replacement
        >>> b[23:28] = []      # Make a deletion
        >>> b[30] += 'y'       # Make another replacement
        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
         [('equal', 16, 19, 17, 20),
          ('replace', 19, 20, 20, 21),
          ('equal', 20, 22, 21, 23),
          ('delete', 22, 27, 23, 23),
          ('equal', 27, 30, 23, 26)],
         [('equal', 31, 34, 27, 30),
          ('replace', 34, 35, 30, 31),
          ('equal', 35, 38, 31, 34)]]
        """

        codes = self.get_opcodes()
        if not codes:
            codes = [("equal", 0, 1, 0, 1)]
        # Fixup leading and trailing groups if they show no changes.
        if codes[0][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[0]
            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
        if codes[-1][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[-1]
            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)

        nn = n + n
        group = []
        for tag, i1, i2, j1, j2 in codes:
            # End the current group and start a new one whenever
            # there is a large range with no changes.
            if tag == 'equal' and i2-i1 > nn:
                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
                yield group
                group = []
                i1, j1 = max(i1, i2-n), max(j1, j2-n)
            group.append((tag, i1, i2, j1, j2))
        if group and not (len(group)==1 and group[0][0] == 'equal'):
            yield group

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        matches = sum(triple[-1] for triple in self.get_matching_blocks())
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.__contains__, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return _calculate_ratio(min(la, lb), la + lb)
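
    # Illustrative sketch (not part of the original source): the three
    # ratio methods form a cheap-to-expensive cascade of upper bounds,
    #
    #   >>> s = SequenceMatcher(None, "abcd", "bcde")
    #   >>> s.real_quick_ratio() >= s.quick_ratio() >= s.ratio()
    #   True
    #
    # which get_close_matches() below exploits to reject hopeless
    # candidates before paying for a full ratio() computation.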

def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword as _keyword
    >>> get_close_matches("wheel", _keyword.kwlist)
    ['while']
    >>> get_close_matches("Apple", _keyword.kwlist)
    []
    >>> get_close_matches("accept", _keyword.kwlist)
    ['except']
    """

    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), x))

    # Move the best scorers to head of list
    result = _nlargest(n, result)
    # Strip scores for the best n matches
    return [x for score, x in result]

def _count_leading(line, ch):
    """
    Return number of `ch` characters at the start of `line`.

    Example:

    >>> _count_leading('   abc', ' ')
    3
    """

    i, n = 0, len(line)
    while i < n and line[i] == ch:
        i += 1
    return i

class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(keepends=True)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(keepends=True)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print(''.join(result), end="")
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; the underlying SequenceMatcher class has
          an adaptive notion of "noise" lines that's better than any static
          definition the author has ever been able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writelines()
        method of a file-like object.

        Example:

        >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
        ...                                'ore\ntree\nemu\n'.splitlines(True))),
        ...       end="")
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError('unknown tag %r' % (tag,))

            yield from g

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        for i in range(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            yield from g

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print(''.join(results), end="")
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in range(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in range(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError('unknown tag %r' % (tag,))
            yield from self._qformat(aelt, belt, atags, btags)
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        yield from g

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
        >>> for line in results: print(repr(line))
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \tabcdefGhijkl\n'
        '? \t ^ ^  ^\n'
        """

        # Can hurt, but will probably help most of the time.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        common = min(common, _count_leading(btags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        yield "- " + aline
        if atags:
            yield "? %s%s\n" % ("\t" * common, atags)

        yield "+ " + bline
        if btags:
            yield "? %s%s\n" % ("\t" * common, btags)

# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element.  The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread".  So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd.  The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread".  The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private".  I can live with that <wink>.

import re

def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match):
    r"""
    Return True for ignorable line: iff `line` is blank or contains a single '#'.

    Examples:

    >>> IS_LINE_JUNK('\n')
    True
    >>> IS_LINE_JUNK('  #   \n')
    True
    >>> IS_LINE_JUNK('hello\n')
    False
    """

    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return True for ignorable character: iff `ch` is a space or tab.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    True
    >>> IS_CHARACTER_JUNK('\t')
    True
    >>> IS_CHARACTER_JUNK('\n')
    False
    >>> IS_CHARACTER_JUNK('x')
    False
    """

    return ch in ws


########################################################################
###  Unified Diff
########################################################################

def _format_range_unified(start, stop):
    'Convert range to the "ed" format'
    # Per the diff spec at http://www.unix.org/single_unix_specification/
    beginning = start + 1     # lines start numbering with one
    length = stop - start
    if length == 1:
        return '{}'.format(beginning)
    if not length:
        beginning -= 1        # empty ranges begin at line just before the range
    return '{},{}'.format(beginning, length)
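
# Illustrative examples (not part of the original source) of the unified
# "start,length" range notation produced above:
#
#   >>> _format_range_unified(0, 1)    # one line, numbered from 1
#   '1'
#   >>> _format_range_unified(0, 3)    # three lines starting at line 1
#   '1,3'
#   >>> _format_range_unified(2, 2)    # empty range just after line 2
#   '2,0'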

def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    The modification times are normally expressed in the ISO 8601 format.

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
    ...             lineterm=''):
    ...     print(line)                 # doctest: +NORMALIZE_WHITESPACE
    --- Original        2005-01-26 23:30:50
    +++ Current         2010-04-02 10:20:52
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """

    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
    started = False
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            started = True
            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
            todate = '\t{}'.format(tofiledate) if tofiledate else ''
            yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
            yield '+++ {}{}{}'.format(tofile, todate, lineterm)

        first, last = group[0], group[-1]
        file1_range = _format_range_unified(first[1], last[2])
        file2_range = _format_range_unified(first[3], last[4])
        yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)

        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
                continue
            if tag in {'replace', 'delete'}:
                for line in a[i1:i2]:
                    yield '-' + line
            if tag in {'replace', 'insert'}:
                for line in b[j1:j2]:
                    yield '+' + line


########################################################################
###  Context Diff
########################################################################

def _format_range_context(start, stop):
    'Convert range to the "ed" format'
    # Per the diff spec at http://www.unix.org/single_unix_specification/
    beginning = start + 1     # lines start numbering with one
    length = stop - start
    if not length:
        beginning -= 1        # empty ranges begin at line just before the range
    if length <= 1:
        return '{}'.format(beginning)
    return '{},{}'.format(beginning, beginning + length - 1)
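
# Illustrative examples (not part of the original source): context format
# ranges are "start,end" pairs rather than the unified "start,length":
#
#   >>> _format_range_context(0, 4)    # lines 1 through 4
#   '1,4'
#   >>> _format_range_context(0, 1)    # a single line collapses to '1'
#   '1'
#   >>> _format_range_context(2, 2)    # empty range just after line 2
#   '2'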

# See http://www.unix.org/single_unix_specification/
def context_diff(a, b, fromfile='', tofile='',
                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a context diff.

    Context diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with *** or ---) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The context diff format normally has a header for filenames and
    modification times.  Any or all of these may be specified using
    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    The modification times are normally expressed in the ISO 8601 format.
    If not specified, the strings default to blanks.

    Example:

    >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),
    ...       'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),
    ...       end="")
    *** Original
    --- Current
    ***************
    *** 1,4 ****
      one
    ! two
    ! three
      four
    --- 1,4 ----
    + zero
      one
    ! tree
      four
    """

    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
    prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
    started = False
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            started = True
            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
            todate = '\t{}'.format(tofiledate) if tofiledate else ''
            yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
            yield '--- {}{}{}'.format(tofile, todate, lineterm)

        first, last = group[0], group[-1]
        yield '***************' + lineterm

        file1_range = _format_range_context(first[1], last[2])
        yield '*** {} ****{}'.format(file1_range, lineterm)

        if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group):
            for tag, i1, i2, _, _ in group:
                if tag != 'insert':
                    for line in a[i1:i2]:
                        yield prefix[tag] + line

        file2_range = _format_range_context(first[3], last[4])
        yield '--- {} ----{}'.format(file2_range, lineterm)

        if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group):
            for tag, _, _, j1, j2 in group:
                if tag != 'delete':
                    for line in b[j1:j2]:
                        yield prefix[tag] + line

def _check_types(a, b, *args):
    # Checking types is weird, but the alternative is garbled output when
    # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
    # without this check, passing filenames as bytes results in output like
    #   --- b'oldfile.txt'
    #   +++ b'newfile.txt'
    # because of how str.format() incorporates bytes objects.
    if a and not isinstance(a[0], str):
        raise TypeError('lines to compare must be str, not %s (%r)' %
                        (type(a[0]).__name__, a[0]))
    if b and not isinstance(b[0], str):
        raise TypeError('lines to compare must be str, not %s (%r)' %
                        (type(b[0]).__name__, b[0]))
    for arg in args:
        if not isinstance(arg, str):
            raise TypeError('all arguments must be str, not: %r' % (arg,))
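
# Illustrative sketch (not part of the original source): because the
# public diff functions are generators, the _check_types() TypeError
# surfaces on first iteration rather than at call time:
#
#   >>> list(unified_diff([b'a\n'], [b'b\n']))
#   Traceback (most recent call last):
#     ...
#   TypeError: lines to compare must be str, not bytes (b'a\n')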

def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
               fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
    r"""
    Compare `a` and `b`, two sequences of lines represented as bytes rather
    than str. This is a wrapper for `dfunc`, which is typically either
    unified_diff() or context_diff(). Inputs are losslessly converted to
    strings so that `dfunc` only has to worry about strings, and encoded
    back to bytes on return. This is necessary to compare files with
    unknown or inconsistent encoding. All other inputs (except `n`) must be
    bytes rather than str.
    """
    def decode(s):
        try:
            return s.decode('ascii', 'surrogateescape')
        except AttributeError as err:
            msg = ('all arguments must be bytes, not %s (%r)' %
                   (type(s).__name__, s))
            raise TypeError(msg) from err
    a = list(map(decode, a))
    b = list(map(decode, b))
    fromfile = decode(fromfile)
    tofile = decode(tofile)
    fromfiledate = decode(fromfiledate)
    tofiledate = decode(tofiledate)
    lineterm = decode(lineterm)

    lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
    for line in lines:
        yield line.encode('ascii', 'surrogateescape')
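
# Illustrative usage sketch (not part of the original source): diffing
# byte lines of unknown encoding via unified_diff(); every str-valued
# argument must be bytes here, and the output lines come back as bytes:
#
#   >>> old = [b'one\n', b'two\n']
#   >>> new = [b'one\n', b'too\n']
#   >>> delta = diff_bytes(unified_diff, old, new,
#   ...                    fromfile=b'old', tofile=b'new')
#   >>> print(b''.join(delta).decode(), end="")
#   --- old
#   +++ new
#   @@ -1,2 +1,2 @@
#    one
#   -two
#   +too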

def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
    r"""
    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

    Optional keyword parameters `linejunk` and `charjunk` are for filter
    functions, or can be None:

    - linejunk: A function that should accept a single string argument and
      return true iff the string is junk.  The default is None, and is
      recommended; the underlying SequenceMatcher class has an adaptive
      notion of "noise" lines.

    - charjunk: A function that accepts a character (string of length
      1), and returns true iff the character is junk. The default is
      the module-level function IS_CHARACTER_JUNK, which filters out
      whitespace characters (a blank or tab; note: it's a bad idea to
      include newline in this!).

    Tools/scripts/ndiff.py is a command-line front-end to this function.

    Example:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
    ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
    >>> print(''.join(diff), end="")
    - one
    ?  ^
    + ore
    ?  ^
    - two
    - three
    ?  -
    + tree
    + emu
    """
    return Differ(linejunk, charjunk).compare(a, b)

def _mdiff(fromlines, tolines, context=None, linejunk=None,
           charjunk=IS_CHARACTER_JUNK):
    r"""Returns generator yielding marked up from/to side by side differences.

    Arguments:
    fromlines -- list of text lines to be compared to tolines
1387    tolines -- list of text lines to be compared to fromlines
    context -- number of context lines to display on each side of difference;
               if None, all from/to text lines will be generated.
1390    linejunk -- passed on to ndiff (see ndiff documentation)
1391    charjunk -- passed on to ndiff (see ndiff documentation)
1392
    This function returns an iterator which yields tuples of the form:
1394    (from line tuple, to line tuple, boolean flag)
1395
1396    from/to line tuple -- (line num, line text)
1397        line num -- integer or None (to indicate a context separation)
1398        line text -- original line text with following markers inserted:
1399            '\0+' -- marks start of added text
1400            '\0-' -- marks start of deleted text
1401            '\0^' -- marks start of changed text
1402            '\1' -- marks end of added/deleted/changed text
1403
1404    boolean flag -- None indicates context separation, True indicates
1405        either "from" or "to" line contains a change, otherwise False.
1406
    This function/iterator was originally developed to generate side by side
    file differences for making HTML pages (see the HtmlDiff class for example
    usage).
1410
1411    Note, this function utilizes the ndiff function to generate the side by
1412    side difference markup.  Optional ndiff arguments may be passed to this
1413    function and they in turn will be passed to ndiff.
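
    Example (an illustrative sketch; '\0' and '\1' are the markers described
    above, shown in repr form as '\x00' and '\x01'):

    >>> diff = _mdiff(['one\n', 'two\n'], ['one\n', 'too\n'])
    >>> for from_line, to_line, flag in diff:
    ...     print(from_line, to_line, flag)
    (1, 'one\n') (1, 'one\n') False
    (2, 't\x00^w\x01o\n') (2, 't\x00^o\x01o\n') True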
1414    """
1415    import re
1416
1417    # regular expression for finding intraline change indices
1418    change_re = re.compile(r'(\++|\-+|\^+)')
1419
1420    # create the difference iterator to generate the differences
1421    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
1422
1423    def _make_line(lines, format_key, side, num_lines=[0,0]):
1424        """Returns line of text with user's change markup and line formatting.
1425
1426        lines -- list of lines from the ndiff generator to produce a line of
1427                 text from.  When producing the line of text to return, the
1428                 lines used are removed from this list.
1429        format_key -- '+' return first line in list with "add" markup around
1430                          the entire line.
1431                      '-' return first line in list with "delete" markup around
1432                          the entire line.
1433                      '?' return first line in list with add/delete/change
1434                          intraline markup (indices obtained from second line)
1435                      None return first line in list with no markup
        side -- index into the num_lines list (0=from,1=to)
        num_lines -- from/to current line number.  This is NOT intended to
                     be passed as a parameter.  It is present as a keyword
                     argument to maintain memory of the current line numbers
                     between calls of this function.
1441
1442        Note, this function is purposefully not defined at the module scope so
1443        that data it needs from its parent function (within whose context it
1444        is defined) does not need to be of module scope.
1445        """
1446        num_lines[side] += 1
        # Handle case where no user markup is to be added; just return the
        # line of text paired with its line number.
1449        if format_key is None:
1450            return (num_lines[side],lines.pop(0)[2:])
1451        # Handle case of intraline changes
1452        if format_key == '?':
1453            text, markers = lines.pop(0), lines.pop(0)
1454            # find intraline changes (store change type and indices in tuples)
1455            sub_info = []
1456            def record_sub_info(match_object,sub_info=sub_info):
1457                sub_info.append([match_object.group(1)[0],match_object.span()])
1458                return match_object.group(1)
1459            change_re.sub(record_sub_info,markers)
1460            # process each tuple inserting our special marks that won't be
1461            # noticed by an xml/html escaper.
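            # e.g. (illustrative) a marker line '?  ^' yields sub_info
            # [['^', (3, 4)]], turning text '- two' into '- t\0^w\1o'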
1462            for key,(begin,end) in reversed(sub_info):
1463                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
1464            text = text[2:]
1465        # Handle case of add/delete entire line
1466        else:
1467            text = lines.pop(0)[2:]
1468            # if line of text is just a newline, insert a space so there is
1469            # something for the user to highlight and see.
1470            if not text:
1471                text = ' '
1472            # insert marks that won't be noticed by an xml/html escaper.
1473            text = '\0' + format_key + text + '\1'
        # Return the line of text paired with its line number; the special
        # marks inserted above will later be replaced with the user's change
        # markup.
1477        return (num_lines[side],text)
1478
1479    def _line_iterator():
1480        """Yields from/to lines of text with a change indication.
1481
1482        This function is an iterator.  It itself pulls lines from a
1483        differencing iterator, processes them and yields them.  When it can
1484        it yields both a "from" and a "to" line, otherwise it will yield one
1485        or the other.  In addition to yielding the lines of from/to text, a
1486        boolean flag is yielded to indicate if the text line(s) have
1487        differences in them.
1488
1489        Note, this function is purposefully not defined at the module scope so
1490        that data it needs from its parent function (within whose context it
1491        is defined) does not need to be of module scope.
1492        """
1493        lines = []
1494        num_blanks_pending, num_blanks_to_yield = 0, 0
1495        while True:
1496            # Load up next 4 lines so we can look ahead, create strings which
1497            # are a concatenation of the first character of each of the 4 lines
1498            # so we can do some very readable comparisons.
1499            while len(lines) < 4:
1500                lines.append(next(diff_lines_iterator, 'X'))
1501            s = ''.join([line[0] for line in lines])
1502            if s.startswith('X'):
1503                # When no more lines, pump out any remaining blank lines so the
1504                # corresponding add/delete lines get a matching blank line so
1505                # all line pairs get yielded at the next level.
1506                num_blanks_to_yield = num_blanks_pending
1507            elif s.startswith('-?+?'):
1508                # simple intraline change
1509                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
1510                continue
1511            elif s.startswith('--++'):
1512                # in delete block, add block coming: we do NOT want to get
1513                # caught up on blank lines yet, just process the delete line
1514                num_blanks_pending -= 1
1515                yield _make_line(lines,'-',0), None, True
1516                continue
1517            elif s.startswith(('--?+', '--+', '- ')):
1518                # in delete block and see an intraline change or unchanged line
1519                # coming: yield the delete line and then blanks
1520                from_line,to_line = _make_line(lines,'-',0), None
1521                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
1522            elif s.startswith('-+?'):
1523                # intraline change
1524                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
1525                continue
1526            elif s.startswith('-?+'):
1527                # intraline change
1528                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
1529                continue
1530            elif s.startswith('-'):
1531                # delete FROM line
1532                num_blanks_pending -= 1
1533                yield _make_line(lines,'-',0), None, True
1534                continue
1535            elif s.startswith('+--'):
1536                # in add block, delete block coming: we do NOT want to get
1537                # caught up on blank lines yet, just process the add line
1538                num_blanks_pending += 1
1539                yield None, _make_line(lines,'+',1), True
1540                continue
1541            elif s.startswith(('+ ', '+-')):
1542                # will be leaving an add block: yield blanks then add line
1543                from_line, to_line = None, _make_line(lines,'+',1)
1544                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
1545            elif s.startswith('+'):
1546                # inside an add block, yield the add line
1547                num_blanks_pending += 1
1548                yield None, _make_line(lines,'+',1), True
1549                continue
1550            elif s.startswith(' '):
1551                # unchanged text, yield it to both sides
1552                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
1553                continue
1554            # Catch up on the blank lines so when we yield the next from/to
1555            # pair, they are lined up.
            while num_blanks_to_yield < 0:
1557                num_blanks_to_yield += 1
1558                yield None,('','\n'),True
            while num_blanks_to_yield > 0:
1560                num_blanks_to_yield -= 1
1561                yield ('','\n'),None,True
1562            if s.startswith('X'):
1563                return
1564            else:
1565                yield from_line,to_line,True
1566
1567    def _line_pair_iterator():
1568        """Yields from/to lines of text with a change indication.
1569
        This function is an iterator.  It pulls lines from the line
        iterator.  Its difference from that iterator is that this function
        always yields a pair of from/to text lines (with the change
        indication).  If necessary it will collect single from/to lines
        until it has a matching from/to pair to yield.
1575
1576        Note, this function is purposefully not defined at the module scope so
1577        that data it needs from its parent function (within whose context it
1578        is defined) does not need to be of module scope.
1579        """
1580        line_iterator = _line_iterator()
1581        fromlines,tolines=[],[]
1582        while True:
1583            # Collecting lines of text until we have a from/to pair
            while not fromlines or not tolines:
1585                try:
1586                    from_line, to_line, found_diff = next(line_iterator)
1587                except StopIteration:
1588                    return
1589                if from_line is not None:
1590                    fromlines.append((from_line,found_diff))
1591                if to_line is not None:
1592                    tolines.append((to_line,found_diff))
1593            # Once we have a pair, remove them from the collection and yield it
            from_line, from_diff = fromlines.pop(0)
            to_line, to_diff = tolines.pop(0)
            yield (from_line,to_line,from_diff or to_diff)
1597
1598    # Handle case where user does not want context differencing, just yield
1599    # them up without doing anything else with them.
1600    line_pair_iterator = _line_pair_iterator()
1601    if context is None:
1602        yield from line_pair_iterator
1603    # Handle case where user wants context differencing.  We must do some
1604    # storage of lines until we know for sure that they are to be yielded.
1605    else:
1606        context += 1
1607        lines_to_write = 0
1608        while True:
1609            # Store lines up until we find a difference, note use of a
1610            # circular queue because we only need to keep around what
1611            # we need for context.
            index, context_lines = 0, [None]*context
1613            found_diff = False
            while not found_diff:
1615                try:
1616                    from_line, to_line, found_diff = next(line_pair_iterator)
1617                except StopIteration:
1618                    return
1619                i = index % context
                context_lines[i] = (from_line, to_line, found_diff)
1621                index += 1
1622            # Yield lines that we have collected so far, but first yield
1623            # the user's separator.
1624            if index > context:
1625                yield None, None, None
1626                lines_to_write = context
1627            else:
1628                lines_to_write = index
1629                index = 0
            while lines_to_write:
                i = index % context
                index += 1
                yield context_lines[i]
                lines_to_write -= 1
1635            # Now yield the context lines after the change
1636            lines_to_write = context-1
1637            try:
                while lines_to_write:
1639                    from_line, to_line, found_diff = next(line_pair_iterator)
1640                    # If another change within the context, extend the context
1641                    if found_diff:
1642                        lines_to_write = context-1
1643                    else:
1644                        lines_to_write -= 1
1645                    yield from_line, to_line, found_diff
1646            except StopIteration:
1647                # Catch exception from next() and return normally
1648                return
1649
1650
1651_file_template = """
1652<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
1653          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
1654
1655<html>
1656
1657<head>
1658    <meta http-equiv="Content-Type"
1659          content="text/html; charset=%(charset)s" />
1660    <title></title>
1661    <style type="text/css">%(styles)s
1662    </style>
1663</head>
1664
1665<body>
1666    %(table)s%(legend)s
1667</body>
1668
1669</html>"""
1670
1671_styles = """
1672        table.diff {font-family:Courier; border:medium;}
1673        .diff_header {background-color:#e0e0e0}
1674        td.diff_header {text-align:right}
1675        .diff_next {background-color:#c0c0c0}
1676        .diff_add {background-color:#aaffaa}
1677        .diff_chg {background-color:#ffff77}
1678        .diff_sub {background-color:#ffaaaa}"""
1679
1680_table_template = """
1681    <table class="diff" id="difflib_chg_%(prefix)s_top"
1682           cellspacing="0" cellpadding="0" rules="groups" >
1683        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
1684        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
1685        %(header_row)s
1686        <tbody>
1687%(data_rows)s        </tbody>
1688    </table>"""
1689
1690_legend = """
1691    <table class="diff" summary="Legends">
1692        <tr> <th colspan="2"> Legends </th> </tr>
1693        <tr> <td> <table border="" summary="Colors">
1694                      <tr><th> Colors </th> </tr>
1695                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
1696                      <tr><td class="diff_chg">Changed</td> </tr>
1697                      <tr><td class="diff_sub">Deleted</td> </tr>
1698                  </table></td>
1699             <td> <table border="" summary="Links">
1700                      <tr><th colspan="2"> Links </th> </tr>
1701                      <tr><td>(f)irst change</td> </tr>
1702                      <tr><td>(n)ext change</td> </tr>
1703                      <tr><td>(t)op</td> </tr>
1704                  </table></td> </tr>
1705    </table>"""
1706
class HtmlDiff:
1708    """For producing HTML side by side comparison with change highlights.
1709
1710    This class can be used to create an HTML table (or a complete HTML file
1711    containing the table) showing a side by side, line by line comparison
1712    of text with inter-line and intra-line change highlights.  The table can
1713    be generated in either full or contextual difference mode.
1714
1715    The following methods are provided for HTML generation:
1716
1717    make_table -- generates HTML for a single side by side table
1718    make_file -- generates complete HTML file with a single side by side table
1719
    See Tools/scripts/diff.py for an example usage of this class.
1721    """
1722
1723    _file_template = _file_template
1724    _styles = _styles
1725    _table_template = _table_template
1726    _legend = _legend
1727    _default_prefix = 0
1728
1729    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
1730                 charjunk=IS_CHARACTER_JUNK):
1731        """HtmlDiff instance initializer
1732
1733        Arguments:
1734        tabsize -- tab stop spacing, defaults to 8.
        wrapcolumn -- column number where lines are broken and wrapped,
            defaults to None, meaning lines are not wrapped.
1737        linejunk,charjunk -- keyword arguments passed into ndiff() (used by
1738            HtmlDiff() to generate the side by side HTML differences).  See
1739            ndiff() documentation for argument default values and descriptions.
1740        """
1741        self._tabsize = tabsize
1742        self._wrapcolumn = wrapcolumn
1743        self._linejunk = linejunk
1744        self._charjunk = charjunk
1745
1746    def make_file(self, fromlines, tolines, fromdesc='', todesc='',
1747                  context=False, numlines=5, *, charset='utf-8'):
1748        """Returns HTML file of side by side comparison with change highlights
1749
1750        Arguments:
1751        fromlines -- list of "from" lines
1752        tolines -- list of "to" lines
1753        fromdesc -- "from" file column header string
1754        todesc -- "to" file column header string
1755        context -- set to True for contextual differences (defaults to False
1756            which shows full differences).
1757        numlines -- number of context lines.  When context is set True,
1758            controls number of lines displayed before and after the change.
1759            When context is False, controls the number of lines to place
1760            the "next" link anchors before the next change (so click of
1761            "next" link jumps to just before the change).
1762        charset -- charset of the HTML document
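
        Example (a small sketch; the result is a complete HTML page):

        >>> html = HtmlDiff().make_file(['one'], ['two'])
        >>> '<table class="diff"' in html
        True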
1763        """
1764
1765        return (self._file_template % dict(
1766            styles=self._styles,
1767            legend=self._legend,
1768            table=self.make_table(fromlines, tolines, fromdesc, todesc,
1769                                  context=context, numlines=numlines),
1770            charset=charset
1771        )).encode(charset, 'xmlcharrefreplace').decode(charset)
1772
1773    def _tab_newline_replace(self,fromlines,tolines):
1774        """Returns from/to line lists with tabs expanded and newlines removed.
1775
1776        Instead of tab characters being replaced by the number of spaces
1777        needed to fill in to the next tab stop, this function will fill
1778        the space with tab characters.  This is done so that the difference
1779        algorithms can identify changes in a file when tabs are replaced by
        spaces and vice versa.  At the end of the HTML generation, the tab
        characters will be replaced with a non-breaking space.
1782        """
1783        def expand_tabs(line):
1784            # hide real spaces
1785            line = line.replace(' ','\0')
1786            # expand tabs into spaces
1787            line = line.expandtabs(self._tabsize)
1788            # replace spaces from expanded tabs back into tab characters
1789            # (we'll replace them with markup after we do differencing)
1790            line = line.replace(' ','\t')
1791            return line.replace('\0',' ').rstrip('\n')
1792        fromlines = [expand_tabs(line) for line in fromlines]
1793        tolines = [expand_tabs(line) for line in tolines]
1794        return fromlines,tolines
1795
1796    def _split_line(self,data_list,line_num,text):
1797        """Builds list of text lines by splitting text lines at wrap point
1798
1799        This function will determine if the input text line needs to be
1800        wrapped (split) into separate lines.  If so, the first wrap point
1801        will be determined and the first line appended to the output
1802        text line list.  This function is used recursively to handle
1803        the second part of the split line to further split it.
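
        Example (a minimal sketch; '>' marks a continuation line):

        >>> d = HtmlDiff(wrapcolumn=4)
        >>> rows = []
        >>> d._split_line(rows, 1, 'abcdefgh')
        >>> rows
        [(1, 'abcd'), ('>', 'efgh')]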
1804        """
1805        # if blank line or context separator, just add it to the output list
1806        if not line_num:
1807            data_list.append((line_num,text))
1808            return
1809
1810        # if line text doesn't need wrapping, just add it to the output list
        size = len(text)
        max_len = self._wrapcolumn
        if (size <= max_len) or ((size - (text.count('\0')*3)) <= max_len):
1814            data_list.append((line_num,text))
1815            return
1816
1817        # scan text looking for the wrap point, keeping track if the wrap
1818        # point is inside markers
1819        i = 0
1820        n = 0
1821        mark = ''
        while n < max_len and i < size:
1823            if text[i] == '\0':
1824                i += 1
1825                mark = text[i]
1826                i += 1
1827            elif text[i] == '\1':
1828                i += 1
1829                mark = ''
1830            else:
1831                i += 1
1832                n += 1
1833
1834        # wrap point is inside text, break it up into separate lines
1835        line1 = text[:i]
1836        line2 = text[i:]
1837
1838        # if wrap point is inside markers, place end marker at end of first
1839        # line and start marker at beginning of second line because each
1840        # line will have its own table tag markup around it.
1841        if mark:
1842            line1 = line1 + '\1'
1843            line2 = '\0' + mark + line2
1844
1845        # tack on first line onto the output list
1846        data_list.append((line_num,line1))
1847
1848        # use this routine again to wrap the remaining text
1849        self._split_line(data_list,'>',line2)
1850
1851    def _line_wrapper(self,diffs):
1852        """Returns iterator that splits (wraps) mdiff text lines"""
1853
1854        # pull from/to data and flags from mdiff iterator
1855        for fromdata,todata,flag in diffs:
1856            # check for context separators and pass them through
1857            if flag is None:
1858                yield fromdata,todata,flag
1859                continue
1860            (fromline,fromtext),(toline,totext) = fromdata,todata
1861            # for each from/to line split it at the wrap column to form
1862            # list of text lines.
1863            fromlist,tolist = [],[]
1864            self._split_line(fromlist,fromline,fromtext)
1865            self._split_line(tolist,toline,totext)
1866            # yield from/to line in pairs inserting blank lines as
1867            # necessary when one side has more wrapped lines
1868            while fromlist or tolist:
1869                if fromlist:
1870                    fromdata = fromlist.pop(0)
1871                else:
1872                    fromdata = ('',' ')
1873                if tolist:
1874                    todata = tolist.pop(0)
1875                else:
1876                    todata = ('',' ')
1877                yield fromdata,todata,flag
1878
1879    def _collect_lines(self,diffs):
1880        """Collects mdiff output into separate lists
1881
1882        Before storing the mdiff from/to data into a list, it is converted
1883        into a single line of text with HTML markup.
1884        """
1885
1886        fromlist,tolist,flaglist = [],[],[]
1887        # pull from/to data and flags from mdiff style iterator
1888        for fromdata,todata,flag in diffs:
1889            try:
1890                # store HTML markup of the lines into the lists
1891                fromlist.append(self._format_line(0,flag,*fromdata))
1892                tolist.append(self._format_line(1,flag,*todata))
1893            except TypeError:
1894                # exceptions occur for lines where context separators go
1895                fromlist.append(None)
1896                tolist.append(None)
1897            flaglist.append(flag)
1898        return fromlist,tolist,flaglist
1899
1900    def _format_line(self,side,flag,linenum,text):
1901        """Returns HTML markup of "from" / "to" text lines
1902
1903        side -- 0 or 1 indicating "from" or "to" text
1904        flag -- indicates if difference on line
1905        linenum -- line number (used for line number column)
1906        text -- line text to be marked up
1907        """
1908        try:
1909            linenum = '%d' % linenum
1910            id = ' id="%s%s"' % (self._prefix[side],linenum)
1911        except TypeError:
1912            # handle blank lines where linenum is '>' or ''
1913            id = ''
        # escape characters that would otherwise be interpreted as HTML markup
1915        text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
1916
        # make spaces non-breaking so they don't get compressed or line wrapped
1918        text = text.replace(' ','&nbsp;').rstrip()
1919
1920        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
1921               % (id,linenum,text)
1922
1923    def _make_prefix(self):
1924        """Create unique anchor prefixes"""
1925
1926        # Generate a unique anchor prefix so multiple tables
1927        # can exist on the same HTML page without conflicts.
1928        fromprefix = "from%d_" % HtmlDiff._default_prefix
1929        toprefix = "to%d_" % HtmlDiff._default_prefix
1930        HtmlDiff._default_prefix += 1
1931        # store prefixes so line format method has access
1932        self._prefix = [fromprefix,toprefix]
1933
1934    def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
1935        """Makes list of "next" links"""
1936
1937        # all anchor names will be generated using the unique "to" prefix
1938        toprefix = self._prefix[1]
1939
1940        # process change flags, generating middle column of next anchors/links
1941        next_id = ['']*len(flaglist)
1942        next_href = ['']*len(flaglist)
1943        num_chg, in_change = 0, False
1944        last = 0
1945        for i,flag in enumerate(flaglist):
1946            if flag:
1947                if not in_change:
1948                    in_change = True
1949                    last = i
                    # at the beginning of a change, drop an anchor a few lines
                    # (the context lines) before the change so the previous
                    # change's "next" link jumps to just before this change
                    i = max(0, i-numlines)
1954                    next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
1955                    # at the beginning of a change, drop a link to the next
1956                    # change
1957                    num_chg += 1
1958                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
1959                         toprefix,num_chg)
1960            else:
1961                in_change = False
1962        # check for cases where there is no content to avoid exceptions
1963        if not flaglist:
1964            flaglist = [False]
1965            next_id = ['']
1966            next_href = ['']
1967            last = 0
1968            if context:
1969                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
1970                tolist = fromlist
1971            else:
1972                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
1973        # if not a change on first line, drop a link
1974        if not flaglist[0]:
1975            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
1976        # redo the last link to link to the top
1977        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
1978
1979        return fromlist,tolist,flaglist,next_href,next_id
1980
1981    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
1982                   numlines=5):
1983        """Returns HTML table of side by side comparison with change highlights
1984
1985        Arguments:
1986        fromlines -- list of "from" lines
1987        tolines -- list of "to" lines
1988        fromdesc -- "from" file column header string
1989        todesc -- "to" file column header string
1990        context -- set to True for contextual differences (defaults to False
1991            which shows full differences).
1992        numlines -- number of context lines.  When context is set True,
1993            controls number of lines displayed before and after the change.
1994            When context is False, controls the number of lines to place
1995            the "next" link anchors before the next change (so click of
1996            "next" link jumps to just before the change).
1997        """
1998
1999        # make unique anchor prefixes so that multiple tables may exist
2000        # on the same page without conflict.
2001        self._make_prefix()
2002
        # expand tabs now, before inserted markup makes the text more
        # difficult to process
2005        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
2006
2007        # create diffs iterator which generates side by side from/to data
2008        if context:
2009            context_lines = numlines
2010        else:
2011            context_lines = None
        diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
                       charjunk=self._charjunk)
2014
2015        # set up iterator to wrap lines that exceed desired width
2016        if self._wrapcolumn:
2017            diffs = self._line_wrapper(diffs)
2018
2019        # collect up from/to lines and flags into lists (also format the lines)
2020        fromlist,tolist,flaglist = self._collect_lines(diffs)
2021
2022        # process change flags, generating middle column of next anchors/links
2023        fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
2024            fromlist,tolist,flaglist,context,numlines)
2025
2026        s = []
2027        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
2028              '<td class="diff_next">%s</td>%s</tr>\n'
2029        for i in range(len(flaglist)):
2030            if flaglist[i] is None:
                # mdiff yields None on separator lines; skip the bogus ones
                # generated for the first line
2033                if i > 0:
2034                    s.append('        </tbody>        \n        <tbody>\n')
2035            else:
                s.append(fmt % (next_id[i],next_href[i],fromlist[i],
                                next_href[i],tolist[i]))
2038        if fromdesc or todesc:
2039            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
2040                '<th class="diff_next"><br /></th>',
2041                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
2042                '<th class="diff_next"><br /></th>',
2043                '<th colspan="2" class="diff_header">%s</th>' % todesc)
2044        else:
2045            header_row = ''
2046
2047        table = self._table_template % dict(
2048            data_rows=''.join(s),
2049            header_row=header_row,
2050            prefix=self._prefix[1])
2051
2052        return table.replace('\0+','<span class="diff_add">'). \
2053                     replace('\0-','<span class="diff_sub">'). \
2054                     replace('\0^','<span class="diff_chg">'). \
2055                     replace('\1','</span>'). \
2056                     replace('\t','&nbsp;')
2057
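# Remove re from the module namespace; it was needed only while the module
# was being defined (functions that need it, like _mdiff, import it locally
# at call time).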
2058del re
2059
2060def restore(delta, which):
2061    r"""
2062    Generate one of the two sequences that generated a delta.
2063
2064    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
2065    lines originating from file 1 or 2 (parameter `which`), stripping off line
2066    prefixes.
2067
2068    Examples:
2069
2070    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
2071    ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
2072    >>> diff = list(diff)
2073    >>> print(''.join(restore(diff, 1)), end="")
2074    one
2075    two
2076    three
2077    >>> print(''.join(restore(diff, 2)), end="")
2078    ore
2079    tree
2080    emu
2081    """
2082    try:
2083        tag = {1: "- ", 2: "+ "}[int(which)]
2084    except KeyError:
2085        raise ValueError('unknown delta choice (must be 1 or 2): %r'
2086                           % which) from None
2087    prefixes = ("  ", tag)
2088    for line in delta:
2089        if line[:2] in prefixes:
2090            yield line[2:]
2091
2092def _test():
2093    import doctest, difflib
2094    return doctest.testmod(difflib)
2095
2096if __name__ == "__main__":
2097    _test()
2098