1""" 2Module difflib -- helpers for computing deltas between objects. 3 4Function get_close_matches(word, possibilities, n=3, cutoff=0.6): 5 Use SequenceMatcher to return list of the best "good enough" matches. 6 7Function context_diff(a, b): 8 For two lists of strings, return a delta in context diff format. 9 10Function ndiff(a, b): 11 Return a delta: the difference between `a` and `b` (lists of strings). 12 13Function restore(delta, which): 14 Return one of the two sequences that generated an ndiff delta. 15 16Function unified_diff(a, b): 17 For two lists of strings, return a delta in unified diff format. 18 19Class SequenceMatcher: 20 A flexible class for comparing pairs of sequences of any type. 21 22Class Differ: 23 For producing human-readable deltas from sequences of lines of text. 24 25Class HtmlDiff: 26 For producing HTML side by side comparison with change highlights. 27""" 28 29__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', 30 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', 31 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match'] 32 33from heapq import nlargest as _nlargest 34from collections import namedtuple as _namedtuple 35 36Match = _namedtuple('Match', 'a b size') 37 38def _calculate_ratio(matches, length): 39 if length: 40 return 2.0 * matches / length 41 return 1.0 42 43class SequenceMatcher: 44 45 """ 46 SequenceMatcher is a flexible class for comparing pairs of sequences of 47 any type, so long as the sequence elements are hashable. The basic 48 algorithm predates, and is a little fancier than, an algorithm 49 published in the late 1980's by Ratcliff and Obershelp under the 50 hyperbolic name "gestalt pattern matching". The basic idea is to find 51 the longest contiguous matching subsequence that contains no "junk" 52 elements (R-O doesn't address junk). The same idea is then applied 53 recursively to the pieces of the sequences to the left and to the right 54 of the matching subsequence. This does not yield minimal edit 55 sequences, but does tend to yield matches that "look right" to people. 56 57 SequenceMatcher tries to compute a "human-friendly diff" between two 58 sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the 59 longest *contiguous* & junk-free matching subsequence. That's what 60 catches peoples' eyes. The Windows(tm) windiff has another interesting 61 notion, pairing up elements that appear uniquely in each sequence. 62 That, and the method here, appear to yield more intuitive difference 63 reports than does diff. This method appears to be the least vulnerable 64 to synching up on blocks of "junk lines", though (like blank lines in 65 ordinary text files, or maybe "<P>" lines in HTML files). That may be 66 because this is the only method of the 3 that has a *concept* of 67 "junk" <wink>. 68 69 Example, comparing two strings, and considering blanks to be "junk": 70 71 >>> s = SequenceMatcher(lambda x: x == " ", 72 ... "private Thread currentThread;", 73 ... "private volatile Thread currentThread;") 74 >>> 75 76 .ratio() returns a float in [0, 1], measuring the "similarity" of the 77 sequences. As a rule of thumb, a .ratio() value over 0.6 means the 78 sequences are close matches: 79 80 >>> print(round(s.ratio(), 3)) 81 0.866 82 >>> 83 84 If you're only interested in where the sequences match, 85 .get_matching_blocks() is handy: 86 87 >>> for block in s.get_matching_blocks(): 88 ... print("a[%d] and b[%d] match for %d elements" % block) 89 a[0] and b[0] match for 8 elements 90 a[8] and b[17] match for 21 elements 91 a[29] and b[38] match for 0 elements 92 93 Note that the last tuple returned by .get_matching_blocks() is always a 94 dummy, (len(a), len(b), 0), and this is the only case in which the last 95 tuple element (number of elements matched) is 0. 96 97 If you want to know how to change the first sequence into the second, 98 use .get_opcodes(): 99 100 >>> for opcode in s.get_opcodes(): 101 ... print("%6s a[%d:%d] b[%d:%d]" % opcode) 102 equal a[0:8] b[0:8] 103 insert a[8:8] b[8:17] 104 equal a[8:29] b[17:38] 105 106 See the Differ class for a fancy human-friendly file differencer, which 107 uses SequenceMatcher both to compare sequences of lines, and to compare 108 sequences of characters within similar (near-matching) lines. 109 110 See also function get_close_matches() in this module, which shows how 111 simple code building on SequenceMatcher can be used to do useful work. 112 113 Timing: Basic R-O is cubic time worst case and quadratic time expected 114 case. SequenceMatcher is quadratic time for the worst case and has 115 expected-case behavior dependent in a complicated way on how many 116 elements the sequences have in common; best case time is linear. 117 118 Methods: 119 120 __init__(isjunk=None, a='', b='') 121 Construct a SequenceMatcher. 122 123 set_seqs(a, b) 124 Set the two sequences to be compared. 125 126 set_seq1(a) 127 Set the first sequence to be compared. 128 129 set_seq2(b) 130 Set the second sequence to be compared. 131 132 find_longest_match(alo, ahi, blo, bhi) 133 Find longest matching block in a[alo:ahi] and b[blo:bhi]. 134 135 get_matching_blocks() 136 Return list of triples describing matching subsequences. 137 138 get_opcodes() 139 Return list of 5-tuples describing how to turn a into b. 140 141 ratio() 142 Return a measure of the sequences' similarity (float in [0,1]). 143 144 quick_ratio() 145 Return an upper bound on .ratio() relatively quickly. 146 147 real_quick_ratio() 148 Return an upper bound on ratio() very quickly. 149 """ 150 151 def __init__(self, isjunk=None, a='', b='', autojunk=True): 152 """Construct a SequenceMatcher. 153 154 Optional arg isjunk is None (the default), or a one-argument 155 function that takes a sequence element and returns true iff the 156 element is junk. None is equivalent to passing "lambda x: 0", i.e. 157 no elements are considered to be junk. For example, pass 158 lambda x: x in " \\t" 159 if you're comparing lines as sequences of characters, and don't 160 want to synch up on blanks or hard tabs. 161 162 Optional arg a is the first of two sequences to be compared. By 163 default, an empty string. The elements of a must be hashable. See 164 also .set_seqs() and .set_seq1(). 165 166 Optional arg b is the second of two sequences to be compared. By 167 default, an empty string. The elements of b must be hashable. See 168 also .set_seqs() and .set_seq2(). 169 170 Optional arg autojunk should be set to False to disable the 171 "automatic junk heuristic" that treats popular elements as junk 172 (see module documentation for more information). 173 """ 174 175 # Members: 176 # a 177 # first sequence 178 # b 179 # second sequence; differences are computed as "what do 180 # we need to do to 'a' to change it into 'b'?" 181 # b2j 182 # for x in b, b2j[x] is a list of the indices (into b) 183 # at which x appears; junk and popular elements do not appear 184 # fullbcount 185 # for x in b, fullbcount[x] == the number of times x 186 # appears in b; only materialized if really needed (used 187 # only for computing quick_ratio()) 188 # matching_blocks 189 # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; 190 # ascending & non-overlapping in i and in j; terminated by 191 # a dummy (len(a), len(b), 0) sentinel 192 # opcodes 193 # a list of (tag, i1, i2, j1, j2) tuples, where tag is 194 # one of 195 # 'replace' a[i1:i2] should be replaced by b[j1:j2] 196 # 'delete' a[i1:i2] should be deleted 197 # 'insert' b[j1:j2] should be inserted 198 # 'equal' a[i1:i2] == b[j1:j2] 199 # isjunk 200 # a user-supplied function taking a sequence element and 201 # returning true iff the element is "junk" -- this has 202 # subtle but helpful effects on the algorithm, which I'll 203 # get around to writing up someday <0.9 wink>. 204 # DON'T USE! Only __chain_b uses this. Use "in self.bjunk". 205 # bjunk 206 # the items in b for which isjunk is True. 207 # bpopular 208 # nonjunk items in b treated as junk by the heuristic (if used). 209 210 self.isjunk = isjunk 211 self.a = self.b = None 212 self.autojunk = autojunk 213 self.set_seqs(a, b) 214 215 def set_seqs(self, a, b): 216 """Set the two sequences to be compared. 217 218 >>> s = SequenceMatcher() 219 >>> s.set_seqs("abcd", "bcde") 220 >>> s.ratio() 221 0.75 222 """ 223 224 self.set_seq1(a) 225 self.set_seq2(b) 226 227 def set_seq1(self, a): 228 """Set the first sequence to be compared. 229 230 The second sequence to be compared is not changed. 231 232 >>> s = SequenceMatcher(None, "abcd", "bcde") 233 >>> s.ratio() 234 0.75 235 >>> s.set_seq1("bcde") 236 >>> s.ratio() 237 1.0 238 >>> 239 240 SequenceMatcher computes and caches detailed information about the 241 second sequence, so if you want to compare one sequence S against 242 many sequences, use .set_seq2(S) once and call .set_seq1(x) 243 repeatedly for each of the other sequences. 244 245 See also set_seqs() and set_seq2(). 246 """ 247 248 if a is self.a: 249 return 250 self.a = a 251 self.matching_blocks = self.opcodes = None 252 253 def set_seq2(self, b): 254 """Set the second sequence to be compared. 255 256 The first sequence to be compared is not changed. 257 258 >>> s = SequenceMatcher(None, "abcd", "bcde") 259 >>> s.ratio() 260 0.75 261 >>> s.set_seq2("abcd") 262 >>> s.ratio() 263 1.0 264 >>> 265 266 SequenceMatcher computes and caches detailed information about the 267 second sequence, so if you want to compare one sequence S against 268 many sequences, use .set_seq2(S) once and call .set_seq1(x) 269 repeatedly for each of the other sequences. 270 271 See also set_seqs() and set_seq1(). 272 """ 273 274 if b is self.b: 275 return 276 self.b = b 277 self.matching_blocks = self.opcodes = None 278 self.fullbcount = None 279 self.__chain_b() 280 281 # For each element x in b, set b2j[x] to a list of the indices in 282 # b where x appears; the indices are in increasing order; note that 283 # the number of times x appears in b is len(b2j[x]) ... 284 # when self.isjunk is defined, junk elements don't show up in this 285 # map at all, which stops the central find_longest_match method 286 # from starting any matching block at a junk element ... 287 # b2j also does not contain entries for "popular" elements, meaning 288 # elements that account for more than 1 + 1% of the total elements, and 289 # when the sequence is reasonably large (>= 200 elements); this can 290 # be viewed as an adaptive notion of semi-junk, and yields an enormous 291 # speedup when, e.g., comparing program files with hundreds of 292 # instances of "return NULL;" ... 293 # note that this is only called when b changes; so for cross-product 294 # kinds of matches, it's best to call set_seq2 once, then set_seq1 295 # repeatedly 296 297 def __chain_b(self): 298 # Because isjunk is a user-defined (not C) function, and we test 299 # for junk a LOT, it's important to minimize the number of calls. 300 # Before the tricks described here, __chain_b was by far the most 301 # time-consuming routine in the whole module! If anyone sees 302 # Jim Roskind, thank him again for profile.py -- I never would 303 # have guessed that. 304 # The first trick is to build b2j ignoring the possibility 305 # of junk. I.e., we don't call isjunk at all yet. Throwing 306 # out the junk later is much cheaper than building b2j "right" 307 # from the start. 308 b = self.b 309 self.b2j = b2j = {} 310 311 for i, elt in enumerate(b): 312 indices = b2j.setdefault(elt, []) 313 indices.append(i) 314 315 # Purge junk elements 316 self.bjunk = junk = set() 317 isjunk = self.isjunk 318 if isjunk: 319 for elt in b2j.keys(): 320 if isjunk(elt): 321 junk.add(elt) 322 for elt in junk: # separate loop avoids separate list of keys 323 del b2j[elt] 324 325 # Purge popular elements that are not junk 326 self.bpopular = popular = set() 327 n = len(b) 328 if self.autojunk and n >= 200: 329 ntest = n // 100 + 1 330 for elt, idxs in b2j.items(): 331 if len(idxs) > ntest: 332 popular.add(elt) 333 for elt in popular: # ditto; as fast for 1% deletion 334 del b2j[elt] 335 336 def find_longest_match(self, alo, ahi, blo, bhi): 337 """Find longest matching block in a[alo:ahi] and b[blo:bhi]. 338 339 If isjunk is not defined: 340 341 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where 342 alo <= i <= i+k <= ahi 343 blo <= j <= j+k <= bhi 344 and for all (i',j',k') meeting those conditions, 345 k >= k' 346 i <= i' 347 and if i == i', j <= j' 348 349 In other words, of all maximal matching blocks, return one that 350 starts earliest in a, and of all those maximal matching blocks that 351 start earliest in a, return the one that starts earliest in b. 352 353 >>> s = SequenceMatcher(None, " abcd", "abcd abcd") 354 >>> s.find_longest_match(0, 5, 0, 9) 355 Match(a=0, b=4, size=5) 356 357 If isjunk is defined, first the longest matching block is 358 determined as above, but with the additional restriction that no 359 junk element appears in the block. Then that block is extended as 360 far as possible by matching (only) junk elements on both sides. So 361 the resulting block never matches on junk except as identical junk 362 happens to be adjacent to an "interesting" match. 363 364 Here's the same example as before, but considering blanks to be 365 junk. That prevents " abcd" from matching the " abcd" at the tail 366 end of the second sequence directly. Instead only the "abcd" can 367 match, and matches the leftmost "abcd" in the second sequence: 368 369 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd") 370 >>> s.find_longest_match(0, 5, 0, 9) 371 Match(a=1, b=0, size=4) 372 373 If no blocks match, return (alo, blo, 0). 374 375 >>> s = SequenceMatcher(None, "ab", "c") 376 >>> s.find_longest_match(0, 2, 0, 1) 377 Match(a=0, b=0, size=0) 378 """ 379 380 # CAUTION: stripping common prefix or suffix would be incorrect. 381 # E.g., 382 # ab 383 # acab 384 # Longest matching block is "ab", but if common prefix is 385 # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so 386 # strip, so ends up claiming that ab is changed to acab by 387 # inserting "ca" in the middle. That's minimal but unintuitive: 388 # "it's obvious" that someone inserted "ac" at the front. 389 # Windiff ends up at the same place as diff, but by pairing up 390 # the unique 'b's and then matching the first two 'a's. 391 392 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ 393 besti, bestj, bestsize = alo, blo, 0 394 # find longest junk-free match 395 # during an iteration of the loop, j2len[j] = length of longest 396 # junk-free match ending with a[i-1] and b[j] 397 j2len = {} 398 nothing = [] 399 for i in range(alo, ahi): 400 # look at all instances of a[i] in b; note that because 401 # b2j has no junk keys, the loop is skipped if a[i] is junk 402 j2lenget = j2len.get 403 newj2len = {} 404 for j in b2j.get(a[i], nothing): 405 # a[i] matches b[j] 406 if j < blo: 407 continue 408 if j >= bhi: 409 break 410 k = newj2len[j] = j2lenget(j-1, 0) + 1 411 if k > bestsize: 412 besti, bestj, bestsize = i-k+1, j-k+1, k 413 j2len = newj2len 414 415 # Extend the best by non-junk elements on each end. In particular, 416 # "popular" non-junk elements aren't in b2j, which greatly speeds 417 # the inner loop above, but also means "the best" match so far 418 # doesn't contain any junk *or* popular non-junk elements. 419 while besti > alo and bestj > blo and \ 420 not isbjunk(b[bestj-1]) and \ 421 a[besti-1] == b[bestj-1]: 422 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 423 while besti+bestsize < ahi and bestj+bestsize < bhi and \ 424 not isbjunk(b[bestj+bestsize]) and \ 425 a[besti+bestsize] == b[bestj+bestsize]: 426 bestsize += 1 427 428 # Now that we have a wholly interesting match (albeit possibly 429 # empty!), we may as well suck up the matching junk on each 430 # side of it too. Can't think of a good reason not to, and it 431 # saves post-processing the (possibly considerable) expense of 432 # figuring out what to do with it. In the case of an empty 433 # interesting match, this is clearly the right thing to do, 434 # because no other kind of match is possible in the regions. 435 while besti > alo and bestj > blo and \ 436 isbjunk(b[bestj-1]) and \ 437 a[besti-1] == b[bestj-1]: 438 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 439 while besti+bestsize < ahi and bestj+bestsize < bhi and \ 440 isbjunk(b[bestj+bestsize]) and \ 441 a[besti+bestsize] == b[bestj+bestsize]: 442 bestsize = bestsize + 1 443 444 return Match(besti, bestj, bestsize) 445 446 def get_matching_blocks(self): 447 """Return list of triples describing matching subsequences. 448 449 Each triple is of the form (i, j, n), and means that 450 a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in 451 i and in j. New in Python 2.5, it's also guaranteed that if 452 (i, j, n) and (i', j', n') are adjacent triples in the list, and 453 the second is not the last triple in the list, then i+n != i' or 454 j+n != j'. IOW, adjacent triples never describe adjacent equal 455 blocks. 456 457 The last triple is a dummy, (len(a), len(b), 0), and is the only 458 triple with n==0. 459 460 >>> s = SequenceMatcher(None, "abxcd", "abcd") 461 >>> list(s.get_matching_blocks()) 462 [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] 463 """ 464 465 if self.matching_blocks is not None: 466 return self.matching_blocks 467 la, lb = len(self.a), len(self.b) 468 469 # This is most naturally expressed as a recursive algorithm, but 470 # at least one user bumped into extreme use cases that exceeded 471 # the recursion limit on their box. So, now we maintain a list 472 # ('queue`) of blocks we still need to look at, and append partial 473 # results to `matching_blocks` in a loop; the matches are sorted 474 # at the end. 475 queue = [(0, la, 0, lb)] 476 matching_blocks = [] 477 while queue: 478 alo, ahi, blo, bhi = queue.pop() 479 i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) 480 # a[alo:i] vs b[blo:j] unknown 481 # a[i:i+k] same as b[j:j+k] 482 # a[i+k:ahi] vs b[j+k:bhi] unknown 483 if k: # if k is 0, there was no matching block 484 matching_blocks.append(x) 485 if alo < i and blo < j: 486 queue.append((alo, i, blo, j)) 487 if i+k < ahi and j+k < bhi: 488 queue.append((i+k, ahi, j+k, bhi)) 489 matching_blocks.sort() 490 491 # It's possible that we have adjacent equal blocks in the 492 # matching_blocks list now. Starting with 2.5, this code was added 493 # to collapse them. 494 i1 = j1 = k1 = 0 495 non_adjacent = [] 496 for i2, j2, k2 in matching_blocks: 497 # Is this block adjacent to i1, j1, k1? 498 if i1 + k1 == i2 and j1 + k1 == j2: 499 # Yes, so collapse them -- this just increases the length of 500 # the first block by the length of the second, and the first 501 # block so lengthened remains the block to compare against. 502 k1 += k2 503 else: 504 # Not adjacent. Remember the first block (k1==0 means it's 505 # the dummy we started with), and make the second block the 506 # new block to compare against. 507 if k1: 508 non_adjacent.append((i1, j1, k1)) 509 i1, j1, k1 = i2, j2, k2 510 if k1: 511 non_adjacent.append((i1, j1, k1)) 512 513 non_adjacent.append( (la, lb, 0) ) 514 self.matching_blocks = list(map(Match._make, non_adjacent)) 515 return self.matching_blocks 516 517 def get_opcodes(self): 518 """Return list of 5-tuples describing how to turn a into b. 519 520 Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple 521 has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the 522 tuple preceding it, and likewise for j1 == the previous j2. 523 524 The tags are strings, with these meanings: 525 526 'replace': a[i1:i2] should be replaced by b[j1:j2] 527 'delete': a[i1:i2] should be deleted. 528 Note that j1==j2 in this case. 529 'insert': b[j1:j2] should be inserted at a[i1:i1]. 530 Note that i1==i2 in this case. 531 'equal': a[i1:i2] == b[j1:j2] 532 533 >>> a = "qabxcd" 534 >>> b = "abycdf" 535 >>> s = SequenceMatcher(None, a, b) 536 >>> for tag, i1, i2, j1, j2 in s.get_opcodes(): 537 ... print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % 538 ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))) 539 delete a[0:1] (q) b[0:0] () 540 equal a[1:3] (ab) b[0:2] (ab) 541 replace a[3:4] (x) b[2:3] (y) 542 equal a[4:6] (cd) b[3:5] (cd) 543 insert a[6:6] () b[5:6] (f) 544 """ 545 546 if self.opcodes is not None: 547 return self.opcodes 548 i = j = 0 549 self.opcodes = answer = [] 550 for ai, bj, size in self.get_matching_blocks(): 551 # invariant: we've pumped out correct diffs to change 552 # a[:i] into b[:j], and the next matching block is 553 # a[ai:ai+size] == b[bj:bj+size]. So we need to pump 554 # out a diff to change a[i:ai] into b[j:bj], pump out 555 # the matching block, and move (i,j) beyond the match 556 tag = '' 557 if i < ai and j < bj: 558 tag = 'replace' 559 elif i < ai: 560 tag = 'delete' 561 elif j < bj: 562 tag = 'insert' 563 if tag: 564 answer.append( (tag, i, ai, j, bj) ) 565 i, j = ai+size, bj+size 566 # the list of matching blocks is terminated by a 567 # sentinel with size 0 568 if size: 569 answer.append( ('equal', ai, i, bj, j) ) 570 return answer 571 572 def get_grouped_opcodes(self, n=3): 573 """ Isolate change clusters by eliminating ranges with no changes. 574 575 Return a generator of groups with up to n lines of context. 576 Each group is in the same format as returned by get_opcodes(). 577 578 >>> from pprint import pprint 579 >>> a = list(map(str, range(1,40))) 580 >>> b = a[:] 581 >>> b[8:8] = ['i'] # Make an insertion 582 >>> b[20] += 'x' # Make a replacement 583 >>> b[23:28] = [] # Make a deletion 584 >>> b[30] += 'y' # Make another replacement 585 >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes())) 586 [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)], 587 [('equal', 16, 19, 17, 20), 588 ('replace', 19, 20, 20, 21), 589 ('equal', 20, 22, 21, 23), 590 ('delete', 22, 27, 23, 23), 591 ('equal', 27, 30, 23, 26)], 592 [('equal', 31, 34, 27, 30), 593 ('replace', 34, 35, 30, 31), 594 ('equal', 35, 38, 31, 34)]] 595 """ 596 597 codes = self.get_opcodes() 598 if not codes: 599 codes = [("equal", 0, 1, 0, 1)] 600 # Fixup leading and trailing groups if they show no changes. 601 if codes[0][0] == 'equal': 602 tag, i1, i2, j1, j2 = codes[0] 603 codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2 604 if codes[-1][0] == 'equal': 605 tag, i1, i2, j1, j2 = codes[-1] 606 codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) 607 608 nn = n + n 609 group = [] 610 for tag, i1, i2, j1, j2 in codes: 611 # End the current group and start a new one whenever 612 # there is a large range with no changes. 613 if tag == 'equal' and i2-i1 > nn: 614 group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n))) 615 yield group 616 group = [] 617 i1, j1 = max(i1, i2-n), max(j1, j2-n) 618 group.append((tag, i1, i2, j1 ,j2)) 619 if group and not (len(group)==1 and group[0][0] == 'equal'): 620 yield group 621 622 def ratio(self): 623 """Return a measure of the sequences' similarity (float in [0,1]). 624 625 Where T is the total number of elements in both sequences, and 626 M is the number of matches, this is 2.0*M / T. 627 Note that this is 1 if the sequences are identical, and 0 if 628 they have nothing in common. 629 630 .ratio() is expensive to compute if you haven't already computed 631 .get_matching_blocks() or .get_opcodes(), in which case you may 632 want to try .quick_ratio() or .real_quick_ratio() first to get an 633 upper bound. 634 635 >>> s = SequenceMatcher(None, "abcd", "bcde") 636 >>> s.ratio() 637 0.75 638 >>> s.quick_ratio() 639 0.75 640 >>> s.real_quick_ratio() 641 1.0 642 """ 643 644 matches = sum(triple[-1] for triple in self.get_matching_blocks()) 645 return _calculate_ratio(matches, len(self.a) + len(self.b)) 646 647 def quick_ratio(self): 648 """Return an upper bound on ratio() relatively quickly. 649 650 This isn't defined beyond that it is an upper bound on .ratio(), and 651 is faster to compute. 652 """ 653 654 # viewing a and b as multisets, set matches to the cardinality 655 # of their intersection; this counts the number of matches 656 # without regard to order, so is clearly an upper bound 657 if self.fullbcount is None: 658 self.fullbcount = fullbcount = {} 659 for elt in self.b: 660 fullbcount[elt] = fullbcount.get(elt, 0) + 1 661 fullbcount = self.fullbcount 662 # avail[x] is the number of times x appears in 'b' less the 663 # number of times we've seen it in 'a' so far ... kinda 664 avail = {} 665 availhas, matches = avail.__contains__, 0 666 for elt in self.a: 667 if availhas(elt): 668 numb = avail[elt] 669 else: 670 numb = fullbcount.get(elt, 0) 671 avail[elt] = numb - 1 672 if numb > 0: 673 matches = matches + 1 674 return _calculate_ratio(matches, len(self.a) + len(self.b)) 675 676 def real_quick_ratio(self): 677 """Return an upper bound on ratio() very quickly. 678 679 This isn't defined beyond that it is an upper bound on .ratio(), and 680 is faster to compute than either .ratio() or .quick_ratio(). 681 """ 682 683 la, lb = len(self.a), len(self.b) 684 # can't have more matches than the number of elements in the 685 # shorter sequence 686 return _calculate_ratio(min(la, lb), la + lb) 687 688def get_close_matches(word, possibilities, n=3, cutoff=0.6): 689 """Use SequenceMatcher to return list of the best "good enough" matches. 690 691 word is a sequence for which close matches are desired (typically a 692 string). 693 694 possibilities is a list of sequences against which to match word 695 (typically a list of strings). 696 697 Optional arg n (default 3) is the maximum number of close matches to 698 return. n must be > 0. 699 700 Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities 701 that don't score at least that similar to word are ignored. 702 703 The best (no more than n) matches among the possibilities are returned 704 in a list, sorted by similarity score, most similar first. 705 706 >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"]) 707 ['apple', 'ape'] 708 >>> import keyword as _keyword 709 >>> get_close_matches("wheel", _keyword.kwlist) 710 ['while'] 711 >>> get_close_matches("Apple", _keyword.kwlist) 712 [] 713 >>> get_close_matches("accept", _keyword.kwlist) 714 ['except'] 715 """ 716 717 if not n > 0: 718 raise ValueError("n must be > 0: %r" % (n,)) 719 if not 0.0 <= cutoff <= 1.0: 720 raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) 721 result = [] 722 s = SequenceMatcher() 723 s.set_seq2(word) 724 for x in possibilities: 725 s.set_seq1(x) 726 if s.real_quick_ratio() >= cutoff and \ 727 s.quick_ratio() >= cutoff and \ 728 s.ratio() >= cutoff: 729 result.append((s.ratio(), x)) 730 731 # Move the best scorers to head of list 732 result = _nlargest(n, result) 733 # Strip scores for the best n matches 734 return [x for score, x in result] 735 736def _count_leading(line, ch): 737 """ 738 Return number of `ch` characters at the start of `line`. 739 740 Example: 741 742 >>> _count_leading(' abc', ' ') 743 3 744 """ 745 746 i, n = 0, len(line) 747 while i < n and line[i] == ch: 748 i += 1 749 return i 750 751class Differ: 752 r""" 753 Differ is a class for comparing sequences of lines of text, and 754 producing human-readable differences or deltas. Differ uses 755 SequenceMatcher both to compare sequences of lines, and to compare 756 sequences of characters within similar (near-matching) lines. 757 758 Each line of a Differ delta begins with a two-letter code: 759 760 '- ' line unique to sequence 1 761 '+ ' line unique to sequence 2 762 ' ' line common to both sequences 763 '? ' line not present in either input sequence 764 765 Lines beginning with '? ' attempt to guide the eye to intraline 766 differences, and were not present in either input sequence. These lines 767 can be confusing if the sequences contain tab characters. 768 769 Note that Differ makes no claim to produce a *minimal* diff. To the 770 contrary, minimal diffs are often counter-intuitive, because they synch 771 up anywhere possible, sometimes accidental matches 100 pages apart. 772 Restricting synch points to contiguous matches preserves some notion of 773 locality, at the occasional cost of producing a longer diff. 774 775 Example: Comparing two texts. 776 777 First we set up the texts, sequences of individual single-line strings 778 ending with newlines (such sequences can also be obtained from the 779 `readlines()` method of file-like objects): 780 781 >>> text1 = ''' 1. Beautiful is better than ugly. 782 ... 2. Explicit is better than implicit. 783 ... 3. Simple is better than complex. 784 ... 4. Complex is better than complicated. 785 ... '''.splitlines(keepends=True) 786 >>> len(text1) 787 4 788 >>> text1[0][-1] 789 '\n' 790 >>> text2 = ''' 1. Beautiful is better than ugly. 791 ... 3. Simple is better than complex. 792 ... 4. Complicated is better than complex. 793 ... 5. Flat is better than nested. 794 ... '''.splitlines(keepends=True) 795 796 Next we instantiate a Differ object: 797 798 >>> d = Differ() 799 800 Note that when instantiating a Differ object we may pass functions to 801 filter out line and character 'junk'. See Differ.__init__ for details. 802 803 Finally, we compare the two: 804 805 >>> result = list(d.compare(text1, text2)) 806 807 'result' is a list of strings, so let's pretty-print it: 808 809 >>> from pprint import pprint as _pprint 810 >>> _pprint(result) 811 [' 1. Beautiful is better than ugly.\n', 812 '- 2. Explicit is better than implicit.\n', 813 '- 3. Simple is better than complex.\n', 814 '+ 3. Simple is better than complex.\n', 815 '? ++\n', 816 '- 4. Complex is better than complicated.\n', 817 '? ^ ---- ^\n', 818 '+ 4. Complicated is better than complex.\n', 819 '? ++++ ^ ^\n', 820 '+ 5. Flat is better than nested.\n'] 821 822 As a single multi-line string it looks like this: 823 824 >>> print(''.join(result), end="") 825 1. Beautiful is better than ugly. 826 - 2. Explicit is better than implicit. 827 - 3. Simple is better than complex. 828 + 3. Simple is better than complex. 829 ? ++ 830 - 4. Complex is better than complicated. 831 ? ^ ---- ^ 832 + 4. Complicated is better than complex. 833 ? ++++ ^ ^ 834 + 5. Flat is better than nested. 835 836 Methods: 837 838 __init__(linejunk=None, charjunk=None) 839 Construct a text differencer, with optional filters. 840 841 compare(a, b) 842 Compare two sequences of lines; generate the resulting delta. 843 """ 844 845 def __init__(self, linejunk=None, charjunk=None): 846 """ 847 Construct a text differencer, with optional filters. 848 849 The two optional keyword parameters are for filter functions: 850 851 - `linejunk`: A function that should accept a single string argument, 852 and return true iff the string is junk. The module-level function 853 `IS_LINE_JUNK` may be used to filter out lines without visible 854 characters, except for at most one splat ('#'). It is recommended 855 to leave linejunk None; the underlying SequenceMatcher class has 856 an adaptive notion of "noise" lines that's better than any static 857 definition the author has ever been able to craft. 858 859 - `charjunk`: A function that should accept a string of length 1. The 860 module-level function `IS_CHARACTER_JUNK` may be used to filter out 861 whitespace characters (a blank or tab; **note**: bad idea to include 862 newline in this!). Use of IS_CHARACTER_JUNK is recommended. 863 """ 864 865 self.linejunk = linejunk 866 self.charjunk = charjunk 867 868 def compare(self, a, b): 869 r""" 870 Compare two sequences of lines; generate the resulting delta. 871 872 Each sequence must contain individual single-line strings ending with 873 newlines. Such sequences can be obtained from the `readlines()` method 874 of file-like objects. The delta generated also consists of newline- 875 terminated strings, ready to be printed as-is via the writeline() 876 method of a file-like object. 877 878 Example: 879 880 >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True), 881 ... 'ore\ntree\nemu\n'.splitlines(True))), 882 ... end="") 883 - one 884 ? ^ 885 + ore 886 ? ^ 887 - two 888 - three 889 ? - 890 + tree 891 + emu 892 """ 893 894 cruncher = SequenceMatcher(self.linejunk, a, b) 895 for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): 896 if tag == 'replace': 897 g = self._fancy_replace(a, alo, ahi, b, blo, bhi) 898 elif tag == 'delete': 899 g = self._dump('-', a, alo, ahi) 900 elif tag == 'insert': 901 g = self._dump('+', b, blo, bhi) 902 elif tag == 'equal': 903 g = self._dump(' ', a, alo, ahi) 904 else: 905 raise ValueError('unknown tag %r' % (tag,)) 906 907 yield from g 908 909 def _dump(self, tag, x, lo, hi): 910 """Generate comparison results for a same-tagged range.""" 911 for i in range(lo, hi): 912 yield '%s %s' % (tag, x[i]) 913 914 def _plain_replace(self, a, alo, ahi, b, blo, bhi): 915 assert alo < ahi and blo < bhi 916 # dump the shorter block first -- reduces the burden on short-term 917 # memory if the blocks are of very different sizes 918 if bhi - blo < ahi - alo: 919 first = self._dump('+', b, blo, bhi) 920 second = self._dump('-', a, alo, ahi) 921 else: 922 first = self._dump('-', a, alo, ahi) 923 second = self._dump('+', b, blo, bhi) 924 925 for g in first, second: 926 yield from g 927 928 def _fancy_replace(self, a, alo, ahi, b, blo, bhi): 929 r""" 930 When replacing one block of lines with another, search the blocks 931 for *similar* lines; the best-matching pair (if any) is used as a 932 synch point, and intraline difference marking is done on the 933 similar pair. Lots of work, but often worth it. 934 935 Example: 936 937 >>> d = Differ() 938 >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1, 939 ... ['abcdefGhijkl\n'], 0, 1) 940 >>> print(''.join(results), end="") 941 - abcDefghiJkl 942 ? ^ ^ ^ 943 + abcdefGhijkl 944 ? ^ ^ ^ 945 """ 946 947 # don't synch up unless the lines have a similarity score of at 948 # least cutoff; best_ratio tracks the best score seen so far 949 best_ratio, cutoff = 0.74, 0.75 950 cruncher = SequenceMatcher(self.charjunk) 951 eqi, eqj = None, None # 1st indices of equal lines (if any) 952 953 # search for the pair that matches best without being identical 954 # (identical lines must be junk lines, & we don't want to synch up 955 # on junk -- unless we have to) 956 for j in range(blo, bhi): 957 bj = b[j] 958 cruncher.set_seq2(bj) 959 for i in range(alo, ahi): 960 ai = a[i] 961 if ai == bj: 962 if eqi is None: 963 eqi, eqj = i, j 964 continue 965 cruncher.set_seq1(ai) 966 # computing similarity is expensive, so use the quick 967 # upper bounds first -- have seen this speed up messy 968 # compares by a factor of 3. 969 # note that ratio() is only expensive to compute the first 970 # time it's called on a sequence pair; the expensive part 971 # of the computation is cached by cruncher 972 if cruncher.real_quick_ratio() > best_ratio and \ 973 cruncher.quick_ratio() > best_ratio and \ 974 cruncher.ratio() > best_ratio: 975 best_ratio, best_i, best_j = cruncher.ratio(), i, j 976 if best_ratio < cutoff: 977 # no non-identical "pretty close" pair 978 if eqi is None: 979 # no identical pair either -- treat it as a straight replace 980 yield from self._plain_replace(a, alo, ahi, b, blo, bhi) 981 return 982 # no close pair, but an identical pair -- synch up on that 983 best_i, best_j, best_ratio = eqi, eqj, 1.0 984 else: 985 # there's a close pair, so forget the identical pair (if any) 986 eqi = None 987 988 # a[best_i] very similar to b[best_j]; eqi is None iff they're not 989 # identical 990 991 # pump out diffs from before the synch point 992 yield from self._fancy_helper(a, alo, best_i, b, blo, best_j) 993 994 # do intraline marking on the synch pair 995 aelt, belt = a[best_i], b[best_j] 996 if eqi is None: 997 # pump out a '-', '?', '+', '?' quad for the synched lines 998 atags = btags = "" 999 cruncher.set_seqs(aelt, belt) 1000 for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes(): 1001 la, lb = ai2 - ai1, bj2 - bj1 1002 if tag == 'replace': 1003 atags += '^' * la 1004 btags += '^' * lb 1005 elif tag == 'delete': 1006 atags += '-' * la 1007 elif tag == 'insert': 1008 btags += '+' * lb 1009 elif tag == 'equal': 1010 atags += ' ' * la 1011 btags += ' ' * lb 1012 else: 1013 raise ValueError('unknown tag %r' % (tag,)) 1014 yield from self._qformat(aelt, belt, atags, btags) 1015 else: 1016 # the synch pair is identical 1017 yield ' ' + aelt 1018 1019 # pump out diffs from after the synch point 1020 yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi) 1021 1022 def _fancy_helper(self, a, alo, ahi, b, blo, bhi): 1023 g = [] 1024 if alo < ahi: 1025 if blo < bhi: 1026 g = self._fancy_replace(a, alo, ahi, b, blo, bhi) 1027 else: 1028 g = self._dump('-', a, alo, ahi) 1029 elif blo < bhi: 1030 g = self._dump('+', b, blo, bhi) 1031 1032 yield from g 1033 1034 def _qformat(self, aline, bline, atags, btags): 1035 r""" 1036 Format "?" output and deal with leading tabs. 1037 1038 Example: 1039 1040 >>> d = Differ() 1041 >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n', 1042 ... ' ^ ^ ^ ', ' ^ ^ ^ ') 1043 >>> for line in results: print(repr(line)) 1044 ... 1045 '- \tabcDefghiJkl\n' 1046 '? \t ^ ^ ^\n' 1047 '+ \tabcdefGhijkl\n' 1048 '? \t ^ ^ ^\n' 1049 """ 1050 1051 # Can hurt, but will probably help most of the time. 1052 common = min(_count_leading(aline, "\t"), 1053 _count_leading(bline, "\t")) 1054 common = min(common, _count_leading(atags[:common], " ")) 1055 common = min(common, _count_leading(btags[:common], " ")) 1056 atags = atags[common:].rstrip() 1057 btags = btags[common:].rstrip() 1058 1059 yield "- " + aline 1060 if atags: 1061 yield "? %s%s\n" % ("\t" * common, atags) 1062 1063 yield "+ " + bline 1064 if btags: 1065 yield "? %s%s\n" % ("\t" * common, btags) 1066 1067# With respect to junk, an earlier version of ndiff simply refused to 1068# *start* a match with a junk element. The result was cases like this: 1069# before: private Thread currentThread; 1070# after: private volatile Thread currentThread; 1071# If you consider whitespace to be junk, the longest contiguous match 1072# not starting with junk is "e Thread currentThread". So ndiff reported 1073# that "e volatil" was inserted between the 't' and the 'e' in "private". 1074# While an accurate view, to people that's absurd. The current version 1075# looks for matching blocks that are entirely junk-free, then extends the 1076# longest one of those as far as possible but only with matching junk. 1077# So now "currentThread" is matched, then extended to suck up the 1078# preceding blank; then "private" is matched, and extended to suck up the 1079# following blank; then "Thread" is matched; and finally ndiff reports 1080# that "volatile " was inserted before "Thread". The only quibble 1081# remaining is that perhaps it was really the case that " volatile" 1082# was inserted after "private". I can live with that <wink>. 1083 1084import re 1085 1086def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match): 1087 r""" 1088 Return 1 for ignorable line: iff `line` is blank or contains a single '#'. 1089 1090 Examples: 1091 1092 >>> IS_LINE_JUNK('\n') 1093 True 1094 >>> IS_LINE_JUNK(' # \n') 1095 True 1096 >>> IS_LINE_JUNK('hello\n') 1097 False 1098 """ 1099 1100 return pat(line) is not None 1101 1102def IS_CHARACTER_JUNK(ch, ws=" \t"): 1103 r""" 1104 Return 1 for ignorable character: iff `ch` is a space or tab. 1105 1106 Examples: 1107 1108 >>> IS_CHARACTER_JUNK(' ') 1109 True 1110 >>> IS_CHARACTER_JUNK('\t') 1111 True 1112 >>> IS_CHARACTER_JUNK('\n') 1113 False 1114 >>> IS_CHARACTER_JUNK('x') 1115 False 1116 """ 1117 1118 return ch in ws 1119 1120 1121######################################################################## 1122### Unified Diff 1123######################################################################## 1124 1125def _format_range_unified(start, stop): 1126 'Convert range to the "ed" format' 1127 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1128 beginning = start + 1 # lines start numbering with one 1129 length = stop - start 1130 if length == 1: 1131 return '{}'.format(beginning) 1132 if not length: 1133 beginning -= 1 # empty ranges begin at line just before the range 1134 return '{},{}'.format(beginning, length) 1135 1136def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', 1137 tofiledate='', n=3, lineterm='\n'): 1138 r""" 1139 Compare two sequences of lines; generate the delta as a unified diff. 1140 1141 Unified diffs are a compact way of showing line changes and a few 1142 lines of context. The number of context lines is set by 'n' which 1143 defaults to three. 1144 1145 By default, the diff control lines (those with ---, +++, or @@) are 1146 created with a trailing newline. This is helpful so that inputs 1147 created from file.readlines() result in diffs that are suitable for 1148 file.writelines() since both the inputs and outputs have trailing 1149 newlines. 1150 1151 For inputs that do not have trailing newlines, set the lineterm 1152 argument to "" so that the output will be uniformly newline free. 1153 1154 The unidiff format normally has a header for filenames and modification 1155 times. Any or all of these may be specified using strings for 1156 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1157 The modification times are normally expressed in the ISO 8601 format. 1158 1159 Example: 1160 1161 >>> for line in unified_diff('one two three four'.split(), 1162 ... 'zero one tree four'.split(), 'Original', 'Current', 1163 ... '2005-01-26 23:30:50', '2010-04-02 10:20:52', 1164 ... lineterm=''): 1165 ... print(line) # doctest: +NORMALIZE_WHITESPACE 1166 --- Original 2005-01-26 23:30:50 1167 +++ Current 2010-04-02 10:20:52 1168 @@ -1,4 +1,4 @@ 1169 +zero 1170 one 1171 -two 1172 -three 1173 +tree 1174 four 1175 """ 1176 1177 _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) 1178 started = False 1179 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1180 if not started: 1181 started = True 1182 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1183 todate = '\t{}'.format(tofiledate) if tofiledate else '' 1184 yield '--- {}{}{}'.format(fromfile, fromdate, lineterm) 1185 yield '+++ {}{}{}'.format(tofile, todate, lineterm) 1186 1187 first, last = group[0], group[-1] 1188 file1_range = _format_range_unified(first[1], last[2]) 1189 file2_range = _format_range_unified(first[3], last[4]) 1190 yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm) 1191 1192 for tag, i1, i2, j1, j2 in group: 1193 if tag == 'equal': 1194 for line in a[i1:i2]: 1195 yield ' ' + line 1196 continue 1197 if tag in {'replace', 'delete'}: 1198 for line in a[i1:i2]: 1199 yield '-' + line 1200 if tag in {'replace', 'insert'}: 1201 for line in b[j1:j2]: 1202 yield '+' + line 1203 1204 1205######################################################################## 1206### Context Diff 1207######################################################################## 1208 1209def _format_range_context(start, stop): 1210 'Convert range to the "ed" format' 1211 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1212 beginning = start + 1 # lines start numbering with one 1213 length = stop - start 1214 if not length: 1215 beginning -= 1 # empty ranges begin at line just before the range 1216 if length <= 1: 1217 return '{}'.format(beginning) 1218 return '{},{}'.format(beginning, beginning + length - 1) 1219 1220# See http://www.unix.org/single_unix_specification/ 1221def context_diff(a, b, fromfile='', tofile='', 1222 fromfiledate='', tofiledate='', n=3, lineterm='\n'): 1223 r""" 1224 Compare two sequences of lines; generate the delta as a context diff. 1225 1226 Context diffs are a compact way of showing line changes and a few 1227 lines of context. The number of context lines is set by 'n' which 1228 defaults to three. 1229 1230 By default, the diff control lines (those with *** or ---) are 1231 created with a trailing newline. This is helpful so that inputs 1232 created from file.readlines() result in diffs that are suitable for 1233 file.writelines() since both the inputs and outputs have trailing 1234 newlines. 1235 1236 For inputs that do not have trailing newlines, set the lineterm 1237 argument to "" so that the output will be uniformly newline free. 1238 1239 The context diff format normally has a header for filenames and 1240 modification times. Any or all of these may be specified using 1241 strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1242 The modification times are normally expressed in the ISO 8601 format. 1243 If not specified, the strings default to blanks. 1244 1245 Example: 1246 1247 >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True), 1248 ... 'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')), 1249 ... end="") 1250 *** Original 1251 --- Current 1252 *************** 1253 *** 1,4 **** 1254 one 1255 ! two 1256 ! three 1257 four 1258 --- 1,4 ---- 1259 + zero 1260 one 1261 ! tree 1262 four 1263 """ 1264 1265 _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) 1266 prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ') 1267 started = False 1268 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1269 if not started: 1270 started = True 1271 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1272 todate = '\t{}'.format(tofiledate) if tofiledate else '' 1273 yield '*** {}{}{}'.format(fromfile, fromdate, lineterm) 1274 yield '--- {}{}{}'.format(tofile, todate, lineterm) 1275 1276 first, last = group[0], group[-1] 1277 yield '***************' + lineterm 1278 1279 file1_range = _format_range_context(first[1], last[2]) 1280 yield '*** {} ****{}'.format(file1_range, lineterm) 1281 1282 if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group): 1283 for tag, i1, i2, _, _ in group: 1284 if tag != 'insert': 1285 for line in a[i1:i2]: 1286 yield prefix[tag] + line 1287 1288 file2_range = _format_range_context(first[3], last[4]) 1289 yield '--- {} ----{}'.format(file2_range, lineterm) 1290 1291 if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group): 1292 for tag, _, _, j1, j2 in group: 1293 if tag != 'delete': 1294 for line in b[j1:j2]: 1295 yield prefix[tag] + line 1296 1297def _check_types(a, b, *args): 1298 # Checking types is weird, but the alternative is garbled output when 1299 # someone passes mixed bytes and str to {unified,context}_diff(). E.g. 1300 # without this check, passing filenames as bytes results in output like 1301 # --- b'oldfile.txt' 1302 # +++ b'newfile.txt' 1303 # because of how str.format() incorporates bytes objects. 1304 if a and not isinstance(a[0], str): 1305 raise TypeError('lines to compare must be str, not %s (%r)' % 1306 (type(a[0]).__name__, a[0])) 1307 if b and not isinstance(b[0], str): 1308 raise TypeError('lines to compare must be str, not %s (%r)' % 1309 (type(b[0]).__name__, b[0])) 1310 for arg in args: 1311 if not isinstance(arg, str): 1312 raise TypeError('all arguments must be str, not: %r' % (arg,)) 1313 1314def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'', 1315 fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'): 1316 r""" 1317 Compare `a` and `b`, two sequences of lines represented as bytes rather 1318 than str. This is a wrapper for `dfunc`, which is typically either 1319 unified_diff() or context_diff(). Inputs are losslessly converted to 1320 strings so that `dfunc` only has to worry about strings, and encoded 1321 back to bytes on return. This is necessary to compare files with 1322 unknown or inconsistent encoding. All other inputs (except `n`) must be 1323 bytes rather than str. 1324 """ 1325 def decode(s): 1326 try: 1327 return s.decode('ascii', 'surrogateescape') 1328 except AttributeError as err: 1329 msg = ('all arguments must be bytes, not %s (%r)' % 1330 (type(s).__name__, s)) 1331 raise TypeError(msg) from err 1332 a = list(map(decode, a)) 1333 b = list(map(decode, b)) 1334 fromfile = decode(fromfile) 1335 tofile = decode(tofile) 1336 fromfiledate = decode(fromfiledate) 1337 tofiledate = decode(tofiledate) 1338 lineterm = decode(lineterm) 1339 1340 lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm) 1341 for line in lines: 1342 yield line.encode('ascii', 'surrogateescape') 1343 1344def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): 1345 r""" 1346 Compare `a` and `b` (lists of strings); return a `Differ`-style delta. 1347 1348 Optional keyword parameters `linejunk` and `charjunk` are for filter 1349 functions, or can be None: 1350 1351 - linejunk: A function that should accept a single string argument and 1352 return true iff the string is junk. The default is None, and is 1353 recommended; the underlying SequenceMatcher class has an adaptive 1354 notion of "noise" lines. 1355 1356 - charjunk: A function that accepts a character (string of length 1357 1), and returns true iff the character is junk. The default is 1358 the module-level function IS_CHARACTER_JUNK, which filters out 1359 whitespace characters (a blank or tab; note: it's a bad idea to 1360 include newline in this!). 1361 1362 Tools/scripts/ndiff.py is a command-line front-end to this function. 1363 1364 Example: 1365 1366 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True), 1367 ... 'ore\ntree\nemu\n'.splitlines(keepends=True)) 1368 >>> print(''.join(diff), end="") 1369 - one 1370 ? ^ 1371 + ore 1372 ? ^ 1373 - two 1374 - three 1375 ? - 1376 + tree 1377 + emu 1378 """ 1379 return Differ(linejunk, charjunk).compare(a, b) 1380 1381def _mdiff(fromlines, tolines, context=None, linejunk=None, 1382 charjunk=IS_CHARACTER_JUNK): 1383 r"""Returns generator yielding marked up from/to side by side differences. 1384 1385 Arguments: 1386 fromlines -- list of text lines to compared to tolines 1387 tolines -- list of text lines to be compared to fromlines 1388 context -- number of context lines to display on each side of difference, 1389 if None, all from/to text lines will be generated. 1390 linejunk -- passed on to ndiff (see ndiff documentation) 1391 charjunk -- passed on to ndiff (see ndiff documentation) 1392 1393 This function returns an iterator which returns a tuple: 1394 (from line tuple, to line tuple, boolean flag) 1395 1396 from/to line tuple -- (line num, line text) 1397 line num -- integer or None (to indicate a context separation) 1398 line text -- original line text with following markers inserted: 1399 '\0+' -- marks start of added text 1400 '\0-' -- marks start of deleted text 1401 '\0^' -- marks start of changed text 1402 '\1' -- marks end of added/deleted/changed text 1403 1404 boolean flag -- None indicates context separation, True indicates 1405 either "from" or "to" line contains a change, otherwise False. 1406 1407 This function/iterator was originally developed to generate side by side 1408 file difference for making HTML pages (see HtmlDiff class for example 1409 usage). 1410 1411 Note, this function utilizes the ndiff function to generate the side by 1412 side difference markup. Optional ndiff arguments may be passed to this 1413 function and they in turn will be passed to ndiff. 1414 """ 1415 import re 1416 1417 # regular expression for finding intraline change indices 1418 change_re = re.compile(r'(\++|\-+|\^+)') 1419 1420 # create the difference iterator to generate the differences 1421 diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk) 1422 1423 def _make_line(lines, format_key, side, num_lines=[0,0]): 1424 """Returns line of text with user's change markup and line formatting. 1425 1426 lines -- list of lines from the ndiff generator to produce a line of 1427 text from. When producing the line of text to return, the 1428 lines used are removed from this list. 1429 format_key -- '+' return first line in list with "add" markup around 1430 the entire line. 1431 '-' return first line in list with "delete" markup around 1432 the entire line. 1433 '?' return first line in list with add/delete/change 1434 intraline markup (indices obtained from second line) 1435 None return first line in list with no markup 1436 side -- indice into the num_lines list (0=from,1=to) 1437 num_lines -- from/to current line number. This is NOT intended to be a 1438 passed parameter. It is present as a keyword argument to 1439 maintain memory of the current line numbers between calls 1440 of this function. 1441 1442 Note, this function is purposefully not defined at the module scope so 1443 that data it needs from its parent function (within whose context it 1444 is defined) does not need to be of module scope. 1445 """ 1446 num_lines[side] += 1 1447 # Handle case where no user markup is to be added, just return line of 1448 # text with user's line format to allow for usage of the line number. 1449 if format_key is None: 1450 return (num_lines[side],lines.pop(0)[2:]) 1451 # Handle case of intraline changes 1452 if format_key == '?': 1453 text, markers = lines.pop(0), lines.pop(0) 1454 # find intraline changes (store change type and indices in tuples) 1455 sub_info = [] 1456 def record_sub_info(match_object,sub_info=sub_info): 1457 sub_info.append([match_object.group(1)[0],match_object.span()]) 1458 return match_object.group(1) 1459 change_re.sub(record_sub_info,markers) 1460 # process each tuple inserting our special marks that won't be 1461 # noticed by an xml/html escaper. 1462 for key,(begin,end) in reversed(sub_info): 1463 text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:] 1464 text = text[2:] 1465 # Handle case of add/delete entire line 1466 else: 1467 text = lines.pop(0)[2:] 1468 # if line of text is just a newline, insert a space so there is 1469 # something for the user to highlight and see. 1470 if not text: 1471 text = ' ' 1472 # insert marks that won't be noticed by an xml/html escaper. 1473 text = '\0' + format_key + text + '\1' 1474 # Return line of text, first allow user's line formatter to do its 1475 # thing (such as adding the line number) then replace the special 1476 # marks with what the user's change markup. 1477 return (num_lines[side],text) 1478 1479 def _line_iterator(): 1480 """Yields from/to lines of text with a change indication. 1481 1482 This function is an iterator. It itself pulls lines from a 1483 differencing iterator, processes them and yields them. When it can 1484 it yields both a "from" and a "to" line, otherwise it will yield one 1485 or the other. In addition to yielding the lines of from/to text, a 1486 boolean flag is yielded to indicate if the text line(s) have 1487 differences in them. 1488 1489 Note, this function is purposefully not defined at the module scope so 1490 that data it needs from its parent function (within whose context it 1491 is defined) does not need to be of module scope. 1492 """ 1493 lines = [] 1494 num_blanks_pending, num_blanks_to_yield = 0, 0 1495 while True: 1496 # Load up next 4 lines so we can look ahead, create strings which 1497 # are a concatenation of the first character of each of the 4 lines 1498 # so we can do some very readable comparisons. 1499 while len(lines) < 4: 1500 lines.append(next(diff_lines_iterator, 'X')) 1501 s = ''.join([line[0] for line in lines]) 1502 if s.startswith('X'): 1503 # When no more lines, pump out any remaining blank lines so the 1504 # corresponding add/delete lines get a matching blank line so 1505 # all line pairs get yielded at the next level. 1506 num_blanks_to_yield = num_blanks_pending 1507 elif s.startswith('-?+?'): 1508 # simple intraline change 1509 yield _make_line(lines,'?',0), _make_line(lines,'?',1), True 1510 continue 1511 elif s.startswith('--++'): 1512 # in delete block, add block coming: we do NOT want to get 1513 # caught up on blank lines yet, just process the delete line 1514 num_blanks_pending -= 1 1515 yield _make_line(lines,'-',0), None, True 1516 continue 1517 elif s.startswith(('--?+', '--+', '- ')): 1518 # in delete block and see an intraline change or unchanged line 1519 # coming: yield the delete line and then blanks 1520 from_line,to_line = _make_line(lines,'-',0), None 1521 num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0 1522 elif s.startswith('-+?'): 1523 # intraline change 1524 yield _make_line(lines,None,0), _make_line(lines,'?',1), True 1525 continue 1526 elif s.startswith('-?+'): 1527 # intraline change 1528 yield _make_line(lines,'?',0), _make_line(lines,None,1), True 1529 continue 1530 elif s.startswith('-'): 1531 # delete FROM line 1532 num_blanks_pending -= 1 1533 yield _make_line(lines,'-',0), None, True 1534 continue 1535 elif s.startswith('+--'): 1536 # in add block, delete block coming: we do NOT want to get 1537 # caught up on blank lines yet, just process the add line 1538 num_blanks_pending += 1 1539 yield None, _make_line(lines,'+',1), True 1540 continue 1541 elif s.startswith(('+ ', '+-')): 1542 # will be leaving an add block: yield blanks then add line 1543 from_line, to_line = None, _make_line(lines,'+',1) 1544 num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0 1545 elif s.startswith('+'): 1546 # inside an add block, yield the add line 1547 num_blanks_pending += 1 1548 yield None, _make_line(lines,'+',1), True 1549 continue 1550 elif s.startswith(' '): 1551 # unchanged text, yield it to both sides 1552 yield _make_line(lines[:],None,0),_make_line(lines,None,1),False 1553 continue 1554 # Catch up on the blank lines so when we yield the next from/to 1555 # pair, they are lined up. 1556 while(num_blanks_to_yield < 0): 1557 num_blanks_to_yield += 1 1558 yield None,('','\n'),True 1559 while(num_blanks_to_yield > 0): 1560 num_blanks_to_yield -= 1 1561 yield ('','\n'),None,True 1562 if s.startswith('X'): 1563 return 1564 else: 1565 yield from_line,to_line,True 1566 1567 def _line_pair_iterator(): 1568 """Yields from/to lines of text with a change indication. 1569 1570 This function is an iterator. It itself pulls lines from the line 1571 iterator. Its difference from that iterator is that this function 1572 always yields a pair of from/to text lines (with the change 1573 indication). If necessary it will collect single from/to lines 1574 until it has a matching pair from/to pair to yield. 1575 1576 Note, this function is purposefully not defined at the module scope so 1577 that data it needs from its parent function (within whose context it 1578 is defined) does not need to be of module scope. 1579 """ 1580 line_iterator = _line_iterator() 1581 fromlines,tolines=[],[] 1582 while True: 1583 # Collecting lines of text until we have a from/to pair 1584 while (len(fromlines)==0 or len(tolines)==0): 1585 try: 1586 from_line, to_line, found_diff = next(line_iterator) 1587 except StopIteration: 1588 return 1589 if from_line is not None: 1590 fromlines.append((from_line,found_diff)) 1591 if to_line is not None: 1592 tolines.append((to_line,found_diff)) 1593 # Once we have a pair, remove them from the collection and yield it 1594 from_line, fromDiff = fromlines.pop(0) 1595 to_line, to_diff = tolines.pop(0) 1596 yield (from_line,to_line,fromDiff or to_diff) 1597 1598 # Handle case where user does not want context differencing, just yield 1599 # them up without doing anything else with them. 1600 line_pair_iterator = _line_pair_iterator() 1601 if context is None: 1602 yield from line_pair_iterator 1603 # Handle case where user wants context differencing. We must do some 1604 # storage of lines until we know for sure that they are to be yielded. 1605 else: 1606 context += 1 1607 lines_to_write = 0 1608 while True: 1609 # Store lines up until we find a difference, note use of a 1610 # circular queue because we only need to keep around what 1611 # we need for context. 1612 index, contextLines = 0, [None]*(context) 1613 found_diff = False 1614 while(found_diff is False): 1615 try: 1616 from_line, to_line, found_diff = next(line_pair_iterator) 1617 except StopIteration: 1618 return 1619 i = index % context 1620 contextLines[i] = (from_line, to_line, found_diff) 1621 index += 1 1622 # Yield lines that we have collected so far, but first yield 1623 # the user's separator. 1624 if index > context: 1625 yield None, None, None 1626 lines_to_write = context 1627 else: 1628 lines_to_write = index 1629 index = 0 1630 while(lines_to_write): 1631 i = index % context 1632 index += 1 1633 yield contextLines[i] 1634 lines_to_write -= 1 1635 # Now yield the context lines after the change 1636 lines_to_write = context-1 1637 try: 1638 while(lines_to_write): 1639 from_line, to_line, found_diff = next(line_pair_iterator) 1640 # If another change within the context, extend the context 1641 if found_diff: 1642 lines_to_write = context-1 1643 else: 1644 lines_to_write -= 1 1645 yield from_line, to_line, found_diff 1646 except StopIteration: 1647 # Catch exception from next() and return normally 1648 return 1649 1650 1651_file_template = """ 1652<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 1653 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 1654 1655<html> 1656 1657<head> 1658 <meta http-equiv="Content-Type" 1659 content="text/html; charset=%(charset)s" /> 1660 <title></title> 1661 <style type="text/css">%(styles)s 1662 </style> 1663</head> 1664 1665<body> 1666 %(table)s%(legend)s 1667</body> 1668 1669</html>""" 1670 1671_styles = """ 1672 table.diff {font-family:Courier; border:medium;} 1673 .diff_header {background-color:#e0e0e0} 1674 td.diff_header {text-align:right} 1675 .diff_next {background-color:#c0c0c0} 1676 .diff_add {background-color:#aaffaa} 1677 .diff_chg {background-color:#ffff77} 1678 .diff_sub {background-color:#ffaaaa}""" 1679 1680_table_template = """ 1681 <table class="diff" id="difflib_chg_%(prefix)s_top" 1682 cellspacing="0" cellpadding="0" rules="groups" > 1683 <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> 1684 <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> 1685 %(header_row)s 1686 <tbody> 1687%(data_rows)s </tbody> 1688 </table>""" 1689 1690_legend = """ 1691 <table class="diff" summary="Legends"> 1692 <tr> <th colspan="2"> Legends </th> </tr> 1693 <tr> <td> <table border="" summary="Colors"> 1694 <tr><th> Colors </th> </tr> 1695 <tr><td class="diff_add"> Added </td></tr> 1696 <tr><td class="diff_chg">Changed</td> </tr> 1697 <tr><td class="diff_sub">Deleted</td> </tr> 1698 </table></td> 1699 <td> <table border="" summary="Links"> 1700 <tr><th colspan="2"> Links </th> </tr> 1701 <tr><td>(f)irst change</td> </tr> 1702 <tr><td>(n)ext change</td> </tr> 1703 <tr><td>(t)op</td> </tr> 1704 </table></td> </tr> 1705 </table>""" 1706 1707class HtmlDiff(object): 1708 """For producing HTML side by side comparison with change highlights. 1709 1710 This class can be used to create an HTML table (or a complete HTML file 1711 containing the table) showing a side by side, line by line comparison 1712 of text with inter-line and intra-line change highlights. The table can 1713 be generated in either full or contextual difference mode. 1714 1715 The following methods are provided for HTML generation: 1716 1717 make_table -- generates HTML for a single side by side table 1718 make_file -- generates complete HTML file with a single side by side table 1719 1720 See tools/scripts/diff.py for an example usage of this class. 1721 """ 1722 1723 _file_template = _file_template 1724 _styles = _styles 1725 _table_template = _table_template 1726 _legend = _legend 1727 _default_prefix = 0 1728 1729 def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None, 1730 charjunk=IS_CHARACTER_JUNK): 1731 """HtmlDiff instance initializer 1732 1733 Arguments: 1734 tabsize -- tab stop spacing, defaults to 8. 1735 wrapcolumn -- column number where lines are broken and wrapped, 1736 defaults to None where lines are not wrapped. 1737 linejunk,charjunk -- keyword arguments passed into ndiff() (used by 1738 HtmlDiff() to generate the side by side HTML differences). See 1739 ndiff() documentation for argument default values and descriptions. 1740 """ 1741 self._tabsize = tabsize 1742 self._wrapcolumn = wrapcolumn 1743 self._linejunk = linejunk 1744 self._charjunk = charjunk 1745 1746 def make_file(self, fromlines, tolines, fromdesc='', todesc='', 1747 context=False, numlines=5, *, charset='utf-8'): 1748 """Returns HTML file of side by side comparison with change highlights 1749 1750 Arguments: 1751 fromlines -- list of "from" lines 1752 tolines -- list of "to" lines 1753 fromdesc -- "from" file column header string 1754 todesc -- "to" file column header string 1755 context -- set to True for contextual differences (defaults to False 1756 which shows full differences). 1757 numlines -- number of context lines. When context is set True, 1758 controls number of lines displayed before and after the change. 1759 When context is False, controls the number of lines to place 1760 the "next" link anchors before the next change (so click of 1761 "next" link jumps to just before the change). 1762 charset -- charset of the HTML document 1763 """ 1764 1765 return (self._file_template % dict( 1766 styles=self._styles, 1767 legend=self._legend, 1768 table=self.make_table(fromlines, tolines, fromdesc, todesc, 1769 context=context, numlines=numlines), 1770 charset=charset 1771 )).encode(charset, 'xmlcharrefreplace').decode(charset) 1772 1773 def _tab_newline_replace(self,fromlines,tolines): 1774 """Returns from/to line lists with tabs expanded and newlines removed. 1775 1776 Instead of tab characters being replaced by the number of spaces 1777 needed to fill in to the next tab stop, this function will fill 1778 the space with tab characters. This is done so that the difference 1779 algorithms can identify changes in a file when tabs are replaced by 1780 spaces and vice versa. At the end of the HTML generation, the tab 1781 characters will be replaced with a nonbreakable space. 1782 """ 1783 def expand_tabs(line): 1784 # hide real spaces 1785 line = line.replace(' ','\0') 1786 # expand tabs into spaces 1787 line = line.expandtabs(self._tabsize) 1788 # replace spaces from expanded tabs back into tab characters 1789 # (we'll replace them with markup after we do differencing) 1790 line = line.replace(' ','\t') 1791 return line.replace('\0',' ').rstrip('\n') 1792 fromlines = [expand_tabs(line) for line in fromlines] 1793 tolines = [expand_tabs(line) for line in tolines] 1794 return fromlines,tolines 1795 1796 def _split_line(self,data_list,line_num,text): 1797 """Builds list of text lines by splitting text lines at wrap point 1798 1799 This function will determine if the input text line needs to be 1800 wrapped (split) into separate lines. If so, the first wrap point 1801 will be determined and the first line appended to the output 1802 text line list. This function is used recursively to handle 1803 the second part of the split line to further split it. 1804 """ 1805 # if blank line or context separator, just add it to the output list 1806 if not line_num: 1807 data_list.append((line_num,text)) 1808 return 1809 1810 # if line text doesn't need wrapping, just add it to the output list 1811 size = len(text) 1812 max = self._wrapcolumn 1813 if (size <= max) or ((size -(text.count('\0')*3)) <= max): 1814 data_list.append((line_num,text)) 1815 return 1816 1817 # scan text looking for the wrap point, keeping track if the wrap 1818 # point is inside markers 1819 i = 0 1820 n = 0 1821 mark = '' 1822 while n < max and i < size: 1823 if text[i] == '\0': 1824 i += 1 1825 mark = text[i] 1826 i += 1 1827 elif text[i] == '\1': 1828 i += 1 1829 mark = '' 1830 else: 1831 i += 1 1832 n += 1 1833 1834 # wrap point is inside text, break it up into separate lines 1835 line1 = text[:i] 1836 line2 = text[i:] 1837 1838 # if wrap point is inside markers, place end marker at end of first 1839 # line and start marker at beginning of second line because each 1840 # line will have its own table tag markup around it. 1841 if mark: 1842 line1 = line1 + '\1' 1843 line2 = '\0' + mark + line2 1844 1845 # tack on first line onto the output list 1846 data_list.append((line_num,line1)) 1847 1848 # use this routine again to wrap the remaining text 1849 self._split_line(data_list,'>',line2) 1850 1851 def _line_wrapper(self,diffs): 1852 """Returns iterator that splits (wraps) mdiff text lines""" 1853 1854 # pull from/to data and flags from mdiff iterator 1855 for fromdata,todata,flag in diffs: 1856 # check for context separators and pass them through 1857 if flag is None: 1858 yield fromdata,todata,flag 1859 continue 1860 (fromline,fromtext),(toline,totext) = fromdata,todata 1861 # for each from/to line split it at the wrap column to form 1862 # list of text lines. 1863 fromlist,tolist = [],[] 1864 self._split_line(fromlist,fromline,fromtext) 1865 self._split_line(tolist,toline,totext) 1866 # yield from/to line in pairs inserting blank lines as 1867 # necessary when one side has more wrapped lines 1868 while fromlist or tolist: 1869 if fromlist: 1870 fromdata = fromlist.pop(0) 1871 else: 1872 fromdata = ('',' ') 1873 if tolist: 1874 todata = tolist.pop(0) 1875 else: 1876 todata = ('',' ') 1877 yield fromdata,todata,flag 1878 1879 def _collect_lines(self,diffs): 1880 """Collects mdiff output into separate lists 1881 1882 Before storing the mdiff from/to data into a list, it is converted 1883 into a single line of text with HTML markup. 1884 """ 1885 1886 fromlist,tolist,flaglist = [],[],[] 1887 # pull from/to data and flags from mdiff style iterator 1888 for fromdata,todata,flag in diffs: 1889 try: 1890 # store HTML markup of the lines into the lists 1891 fromlist.append(self._format_line(0,flag,*fromdata)) 1892 tolist.append(self._format_line(1,flag,*todata)) 1893 except TypeError: 1894 # exceptions occur for lines where context separators go 1895 fromlist.append(None) 1896 tolist.append(None) 1897 flaglist.append(flag) 1898 return fromlist,tolist,flaglist 1899 1900 def _format_line(self,side,flag,linenum,text): 1901 """Returns HTML markup of "from" / "to" text lines 1902 1903 side -- 0 or 1 indicating "from" or "to" text 1904 flag -- indicates if difference on line 1905 linenum -- line number (used for line number column) 1906 text -- line text to be marked up 1907 """ 1908 try: 1909 linenum = '%d' % linenum 1910 id = ' id="%s%s"' % (self._prefix[side],linenum) 1911 except TypeError: 1912 # handle blank lines where linenum is '>' or '' 1913 id = '' 1914 # replace those things that would get confused with HTML symbols 1915 text=text.replace("&","&").replace(">",">").replace("<","<") 1916 1917 # make space non-breakable so they don't get compressed or line wrapped 1918 text = text.replace(' ',' ').rstrip() 1919 1920 return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \ 1921 % (id,linenum,text) 1922 1923 def _make_prefix(self): 1924 """Create unique anchor prefixes""" 1925 1926 # Generate a unique anchor prefix so multiple tables 1927 # can exist on the same HTML page without conflicts. 1928 fromprefix = "from%d_" % HtmlDiff._default_prefix 1929 toprefix = "to%d_" % HtmlDiff._default_prefix 1930 HtmlDiff._default_prefix += 1 1931 # store prefixes so line format method has access 1932 self._prefix = [fromprefix,toprefix] 1933 1934 def _convert_flags(self,fromlist,tolist,flaglist,context,numlines): 1935 """Makes list of "next" links""" 1936 1937 # all anchor names will be generated using the unique "to" prefix 1938 toprefix = self._prefix[1] 1939 1940 # process change flags, generating middle column of next anchors/links 1941 next_id = ['']*len(flaglist) 1942 next_href = ['']*len(flaglist) 1943 num_chg, in_change = 0, False 1944 last = 0 1945 for i,flag in enumerate(flaglist): 1946 if flag: 1947 if not in_change: 1948 in_change = True 1949 last = i 1950 # at the beginning of a change, drop an anchor a few lines 1951 # (the context lines) before the change for the previous 1952 # link 1953 i = max([0,i-numlines]) 1954 next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg) 1955 # at the beginning of a change, drop a link to the next 1956 # change 1957 num_chg += 1 1958 next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % ( 1959 toprefix,num_chg) 1960 else: 1961 in_change = False 1962 # check for cases where there is no content to avoid exceptions 1963 if not flaglist: 1964 flaglist = [False] 1965 next_id = [''] 1966 next_href = [''] 1967 last = 0 1968 if context: 1969 fromlist = ['<td></td><td> No Differences Found </td>'] 1970 tolist = fromlist 1971 else: 1972 fromlist = tolist = ['<td></td><td> Empty File </td>'] 1973 # if not a change on first line, drop a link 1974 if not flaglist[0]: 1975 next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix 1976 # redo the last link to link to the top 1977 next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix) 1978 1979 return fromlist,tolist,flaglist,next_href,next_id 1980 1981 def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False, 1982 numlines=5): 1983 """Returns HTML table of side by side comparison with change highlights 1984 1985 Arguments: 1986 fromlines -- list of "from" lines 1987 tolines -- list of "to" lines 1988 fromdesc -- "from" file column header string 1989 todesc -- "to" file column header string 1990 context -- set to True for contextual differences (defaults to False 1991 which shows full differences). 1992 numlines -- number of context lines. When context is set True, 1993 controls number of lines displayed before and after the change. 1994 When context is False, controls the number of lines to place 1995 the "next" link anchors before the next change (so click of 1996 "next" link jumps to just before the change). 1997 """ 1998 1999 # make unique anchor prefixes so that multiple tables may exist 2000 # on the same page without conflict. 2001 self._make_prefix() 2002 2003 # change tabs to spaces before it gets more difficult after we insert 2004 # markup 2005 fromlines,tolines = self._tab_newline_replace(fromlines,tolines) 2006 2007 # create diffs iterator which generates side by side from/to data 2008 if context: 2009 context_lines = numlines 2010 else: 2011 context_lines = None 2012 diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk, 2013 charjunk=self._charjunk) 2014 2015 # set up iterator to wrap lines that exceed desired width 2016 if self._wrapcolumn: 2017 diffs = self._line_wrapper(diffs) 2018 2019 # collect up from/to lines and flags into lists (also format the lines) 2020 fromlist,tolist,flaglist = self._collect_lines(diffs) 2021 2022 # process change flags, generating middle column of next anchors/links 2023 fromlist,tolist,flaglist,next_href,next_id = self._convert_flags( 2024 fromlist,tolist,flaglist,context,numlines) 2025 2026 s = [] 2027 fmt = ' <tr><td class="diff_next"%s>%s</td>%s' + \ 2028 '<td class="diff_next">%s</td>%s</tr>\n' 2029 for i in range(len(flaglist)): 2030 if flaglist[i] is None: 2031 # mdiff yields None on separator lines skip the bogus ones 2032 # generated for the first line 2033 if i > 0: 2034 s.append(' </tbody> \n <tbody>\n') 2035 else: 2036 s.append( fmt % (next_id[i],next_href[i],fromlist[i], 2037 next_href[i],tolist[i])) 2038 if fromdesc or todesc: 2039 header_row = '<thead><tr>%s%s%s%s</tr></thead>' % ( 2040 '<th class="diff_next"><br /></th>', 2041 '<th colspan="2" class="diff_header">%s</th>' % fromdesc, 2042 '<th class="diff_next"><br /></th>', 2043 '<th colspan="2" class="diff_header">%s</th>' % todesc) 2044 else: 2045 header_row = '' 2046 2047 table = self._table_template % dict( 2048 data_rows=''.join(s), 2049 header_row=header_row, 2050 prefix=self._prefix[1]) 2051 2052 return table.replace('\0+','<span class="diff_add">'). \ 2053 replace('\0-','<span class="diff_sub">'). \ 2054 replace('\0^','<span class="diff_chg">'). \ 2055 replace('\1','</span>'). \ 2056 replace('\t',' ') 2057 2058del re 2059 2060def restore(delta, which): 2061 r""" 2062 Generate one of the two sequences that generated a delta. 2063 2064 Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract 2065 lines originating from file 1 or 2 (parameter `which`), stripping off line 2066 prefixes. 2067 2068 Examples: 2069 2070 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True), 2071 ... 'ore\ntree\nemu\n'.splitlines(keepends=True)) 2072 >>> diff = list(diff) 2073 >>> print(''.join(restore(diff, 1)), end="") 2074 one 2075 two 2076 three 2077 >>> print(''.join(restore(diff, 2)), end="") 2078 ore 2079 tree 2080 emu 2081 """ 2082 try: 2083 tag = {1: "- ", 2: "+ "}[int(which)] 2084 except KeyError: 2085 raise ValueError('unknown delta choice (must be 1 or 2): %r' 2086 % which) from None 2087 prefixes = (" ", tag) 2088 for line in delta: 2089 if line[:2] in prefixes: 2090 yield line[2:] 2091 2092def _test(): 2093 import doctest, difflib 2094 return doctest.testmod(difflib) 2095 2096if __name__ == "__main__": 2097 _test() 2098