#!/usr/bin/env python
#
# Shared helpers for line-oriented test-result tooling: diffing files line by
# line, colorizing such diffs, gathering pass/fail statistics, converting
# between text and "U+XXXX" code point lists, and maintaining MANIFEST files.
#
# The module is written to run on both Python 2 and Python 3.

from __future__ import print_function

import sys
import os
import re
import difflib
import unicodedata
import errno
from itertools import *

try:
    from itertools import izip_longest  # Python 2
except ImportError:
    from itertools import zip_longest as izip_longest  # Python 3

try:
    import html as _html  # Python 3
except ImportError:
    _html = None
    import cgi  # Python 2 only; the cgi module is gone from Python 3.13

diff_symbols = "-+=*&^%$#@!~/"
diff_colors = ['red', 'green', 'blue']

try:
    unichr = unichr

    if sys.maxunicode < 0x10FFFF:
        # workarounds for Python 2 "narrow" builds with UCS2-only support.

        _narrow_unichr = unichr

        def unichr(i):
            r"""
            Return the unicode character whose Unicode code is the integer 'i'.
            The valid range is 0 to 0x10FFFF inclusive.

            >>> _narrow_unichr(0xFFFF + 1)
            Traceback (most recent call last):
              File "<stdin>", line 1, in ?
            ValueError: unichr() arg not in range(0x10000) (narrow Python build)
            >>> unichr(0xFFFF + 1) == u'\U00010000'
            True
            >>> unichr(1114111) == u'\U0010FFFF'
            True
            >>> unichr(0x10FFFF + 1)
            Traceback (most recent call last):
              File "<stdin>", line 1, in ?
            ValueError: unichr() arg not in range(0x110000)
            """
            try:
                return _narrow_unichr(i)
            except ValueError:
                try:
                    # Build a "\UXXXXXXXX" escape and let the codec produce
                    # the surrogate pair the narrow build cannot make itself.
                    padded_hex_str = hex(i)[2:].zfill(8)
                    escape_str = "\\U" + padded_hex_str
                    return escape_str.decode("unicode-escape")
                except UnicodeDecodeError:
                    raise ValueError('unichr() arg not in range(0x110000)')

except NameError:
    # Python 3: chr() already covers the full Unicode range.
    unichr = chr


def _escape_markup(s):
    """Escape '&', '<' and '>' in s (quotes are left alone, like cgi.escape)."""
    if _html is not None:
        return _html.escape(s, quote=False)
    return cgi.escape(s)


def _exit_on_io_error(e):
    """Report IOError e on stderr, unless it is a broken pipe, then exit 1."""
    if e.errno != errno.EPIPE:
        print("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
    sys.exit(1)


class ColorFormatter:
    """Namespace of output formatters used to highlight diff fragments.

    Each formatter exposes the same four static methods: start_color(c),
    end_color(), escape(s) and newline().
    """

    class Null:
        """Formatter that emits no markup at all."""

        @staticmethod
        def start_color(c):
            return ''

        @staticmethod
        def end_color():
            return ''

        @staticmethod
        def escape(s):
            return s

        @staticmethod
        def newline():
            return '\n'

    class ANSI:
        """Formatter that emits ANSI terminal escape sequences."""

        _codes = {
            'red': '\033[41;37;1m',
            'green': '\033[42;37;1m',
            'blue': '\033[44;37;1m',
        }

        @staticmethod
        def start_color(c):
            # Raises KeyError for a color name that is not in the table.
            return ColorFormatter.ANSI._codes[c]

        @staticmethod
        def end_color():
            return '\033[m'

        @staticmethod
        def escape(s):
            return s

        @staticmethod
        def newline():
            return '\n'

    class HTML:
        """Formatter that emits HTML <span> markup."""

        @staticmethod
        def start_color(c):
            return '<span style="background:%s">' % c

        @staticmethod
        def end_color():
            return '</span>'

        @staticmethod
        def escape(s):
            return _escape_markup(s)

        @staticmethod
        def newline():
            return '<br/>\n'

    @staticmethod
    def Auto(argv=None, out=sys.stdout):
        """Pick a formatter from command-line flags, removing them from argv.

        Recognized flags are --format, --format=ansi, --format=html and
        --no-format.  They are checked in that order, so when several are
        present the last one in that order wins.  One occurrence of each
        recognized flag is removed from argv in place.  Without any flag
        (or without argv) the ANSI formatter is returned.  The out
        parameter is accepted but not consulted.
        """
        chosen = ColorFormatter.ANSI
        if argv is None:
            return chosen
        flag_table = (
            ("--format", ColorFormatter.ANSI),
            ("--format=ansi", ColorFormatter.ANSI),
            ("--format=html", ColorFormatter.HTML),
            ("--no-format", ColorFormatter.Null),
        )
        for flag, formatter in flag_table:
            if flag in argv:
                argv.remove(flag)
                chosen = formatter
        return chosen


class DiffColorizer:
    """Highlights the word-level differences between paired diff lines."""

    # Splits a line into word tokens and single separator characters.
    diff_regex = re.compile('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

    def __init__(self, formatter, colors=diff_colors, symbols=diff_symbols):
        self.formatter = formatter
        self.colors = colors
        self.symbols = symbols

    def _close_colors(self, parts, active):
        """Append end_color() to every output whose highlight is open."""
        for i in range(2):
            if active[i]:
                parts[i].append(self.formatter.end_color())
                active[i] = False

    def colorize_lines(self, lines):
        """Return the two given lines with their differing tokens highlighted.

        lines is a pair of strings; a falsy entry is treated as an empty
        line.  Each non-empty result is prefixed with its diff symbol and
        terminated with the formatter's newline.
        """
        texts = (l if l else '' for l in lines)
        # One token per line, so that difflib can compare token sequences.
        tokens = [self.diff_regex.sub(r'\1\n\2\n', l).splitlines(True) for l in texts]
        parts = [[], []]
        active = [False, False]
        for l in difflib.Differ().compare(*tokens):
            tag = l[0]
            if tag == '?':
                # Intraline hint from difflib; not part of either line.
                continue
            if tag == ' ':
                self._close_colors(parts, active)
                common = self.formatter.escape(l[2:])
                for p in parts:
                    p.append(common)
                continue
            if tag in self.symbols:
                i = self.symbols.index(tag)
                if not active[i]:
                    parts[i].append(self.formatter.start_color(self.colors[i]))
                    active[i] = True
                parts[i].append(self.formatter.escape(l[2:]))
                continue
        self._close_colors(parts, active)
        # Drop the newlines that were only inserted to separate the tokens.
        outputs = [''.join(p).replace('\n', '') for p in parts]
        return [s1 + s2 + self.formatter.newline()
                for (s1, s2) in zip(self.symbols, outputs) if s2]

    def colorize_diff(self, f):
        """Yield the lines of diff f with changed line pairs colorized.

        Lines whose first character is not a diff symbol are passed through
        (escaped).  Lines that do start with a symbol are buffered until a
        pair is complete or a slot would be overwritten, then flushed
        through colorize_lines().  Only two slots exist, so only the first
        two symbols can be buffered.
        """
        pending = [None, None]
        for l in f:
            if l[0] not in self.symbols:
                yield self.formatter.escape(l).replace('\n', self.formatter.newline())
                continue
            i = self.symbols.index(l[0])
            if pending[i]:
                # Flush
                for line in self.colorize_lines(pending):
                    yield line
                pending = [None, None]
            pending[i] = l[1:]
            if all(pending):
                # Flush
                for line in self.colorize_lines(pending):
                    yield line
                pending = [None, None]
        if any(pending):
            # Flush
            for line in self.colorize_lines(pending):
                yield line


class ZipDiffer:
    """Line-by-line comparison of several files read in lockstep."""

    @staticmethod
    def diff_files(files, symbols=diff_symbols):
        """Write a lockstep diff of files to standard output.

        A row whose lines are all equal is written once, prefixed with a
        space.  Otherwise every non-empty line of the row is written,
        prefixed with the symbol belonging to its file position.
        """
        files = tuple(files)  # in case it's a generator, copy it
        try:
            for lines in izip_longest(*files):
                if all(lines[0] == line for line in lines[1:]):
                    sys.stdout.writelines([" ", lines[0]])
                    continue

                for i, l in enumerate(lines):
                    if l:
                        sys.stdout.writelines([symbols[i], l])
        except IOError as e:
            _exit_on_io_error(e)


class DiffFilters:
    """Filters over diff output."""

    @staticmethod
    def filter_failures(f):
        """Yield only the lines of f that belong to test cases that failed."""
        for key, lines in DiffHelpers.separate_test_cases(f):
            lines = list(lines)
            if not DiffHelpers.test_passed(lines):
                for l in lines:
                    yield l


class Stat:
    """Running count and summed frequency of the tests added so far."""

    def __init__(self):
        self.count = 0
        self.freq = 0

    def add(self, test):
        self.count += 1
        self.freq += test.freq


class Stats:
    """Passed, failed and total Stat counters for a set of tests."""

    def __init__(self):
        self.passed = Stat()
        self.failed = Stat()
        self.total = Stat()

    def add(self, test):
        self.total.add(test)
        if test.passed:
            self.passed.add(test)
        else:
            self.failed.add(test)

    def mean(self):
        """Return the fraction of tests that passed.

        Raises ZeroDivisionError when no test has been added.
        """
        return float(self.passed.count) / self.total.count

    def variance(self):
        """Return (pass fraction) * (fail fraction)."""
        return (float(self.passed.count) / self.total.count) * \
               (float(self.failed.count) / self.total.count)

    def stddev(self):
        return self.variance() ** .5

    def zscore(self, population):
        """Calculate the standard score.
        Population is the Stats for population.
        Self is Stats for sample.
        Returns larger absolute value if sample is highly unlikely to be random.
        Anything outside of -3..+3 is very unlikely to be random.
        See: http://en.wikipedia.org/wiki/Standard_score

        When the population has no spread (everything passed or everything
        failed) the result is 0.0 if the means agree, otherwise +/-infinity,
        instead of dividing by zero."""

        delta = self.mean() - population.mean()
        spread = population.stddev()
        if spread == 0:
            if delta == 0:
                return 0.0
            return float('inf') if delta > 0 else float('-inf')
        return delta / spread


class DiffSinks:
    """Consumers that summarize diff output on standard output."""

    @staticmethod
    def print_stat(f):
        """Print how many test cases in diff f passed and failed."""
        passed = 0
        failed = 0
        # XXX port to Stats, but that would really slow us down here
        for key, lines in DiffHelpers.separate_test_cases(f):
            if DiffHelpers.test_passed(lines):
                passed += 1
            else:
                failed += 1
        total = passed + failed
        # An empty input has no failure rate to speak of; report 0%.
        failed_percent = 100. * failed / total if total else 0.
        print("%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, failed_percent))

    @staticmethod
    def print_ngrams(f, ns=(1, 2, 3)):
        """Print a z-score for every code point n-gram that failed often.

        For each n in ns, every n-gram of the code points of every test is
        tallied; n-grams with fewer than 30 failures are dropped.
        """
        gens = tuple(Ngram.generator(n) for n in ns)
        allstats = Stats()
        allgrams = {}
        for key, lines in DiffHelpers.separate_test_cases(f):
            test = Test(lines)
            allstats.add(test)

            # A test without a <...> line has no code points to tally.
            unicodes = test.unicodes or ()
            for gen in gens:
                for ngram in gen(unicodes):
                    if ngram not in allgrams:
                        allgrams[ngram] = Stats()
                    allgrams[ngram].add(test)

        importantgrams = {}
        for ngram, stats in allgrams.items():
            if stats.failed.count >= 30:  # for statistical reasons
                importantgrams[ngram] = stats
        allgrams = importantgrams
        del importantgrams

        for ngram, stats in allgrams.items():
            print("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore(allstats), stats.failed.count, stats.passed.count, ','.join("U+%04X" % u for u in ngram)))


class Test:
    """One test case parsed from the diff lines that share an identifier.

    Attributes: freq (always 1), passed (False as soon as any line does not
    start with a space), identifier (text between the first character and
    the first colon), and the payloads text for "(...)", unicodes for
    "<...>" (parsed into a list of ints) and glyphs for "[...]".
    """

    # This is a data class, not a test case; keep pytest from collecting it.
    __test__ = False

    def __init__(self, lines):
        self.freq = 1
        self.passed = True
        self.identifier = None
        self.text = None
        self.unicodes = None
        self.glyphs = None
        for l in lines:
            symbol = l[0]
            if symbol != ' ':
                self.passed = False
            i = 1
            if ':' in l:
                i = l.index(':')
                if not self.identifier:
                    self.identifier = l[1:i]
                i = i + 2  # Skip colon and space
            # j indexes the closing bracket: the last character, or the one
            # before a trailing newline.
            j = -1
            if l[j] == '\n':
                j -= 1
            brackets = l[i] + l[j]
            l = l[i + 1:j]
            if brackets == '()':
                self.text = l
            elif brackets == '<>':
                self.unicodes = Unicode.parse(l)
            elif brackets == '[]':
                # XXX we don't handle failed tests here
                self.glyphs = l


class DiffHelpers:
    """Helpers for grouping and judging diff lines."""

    @staticmethod
    def separate_test_cases(f):
        '''Reads lines from f, and if the lines have identifiers, ie.
        have a colon character, groups them by identifier,
        yielding lists of all lines with the same identifier.'''

        def identifier(l):
            if ':' in l[1:]:
                # Search from column 1: column 0 holds the diff symbol.
                return l[1:l.index(':', 1)]
            return l
        return groupby(f, key=identifier)

    @staticmethod
    def test_passed(lines):
        """Return True if the lines of one test case count as a pass.

        A case passes when every line starts with a space, or when any
        '+' line contains one of the markers listed below.
        """
        lines = list(lines)
        # XXX This is a hack, but does the job for now.
        markers = ("space+0|space+0", "uni25CC", "dottedcircle", "glyph0", "gid0", "notdef")
        for l in lines:
            if l.startswith('+') and any(marker in l for marker in markers):
                return True
        return all(l.startswith(' ') for l in lines)


class FilterHelpers:
    """Adapters that turn a line filter into a function that prints."""

    @staticmethod
    def filter_printer_function(filter_callback):
        """Return a function printing each filtered line with print()."""
        def printer(f):
            for line in filter_callback(f):
                print(line)
        return printer

    @staticmethod
    def filter_printer_function_no_newline(filter_callback):
        """Return a function writing each filtered line without adding a newline."""
        def printer(f):
            for line in filter_callback(f):
                sys.stdout.writelines([line])
        return printer


class Ngram:
    """Factory for n-gram generators."""

    @staticmethod
    def generator(n):
        """Return a generator function yielding all n-long windows as tuples.

        The returned function carries n as its attribute .n.
        """

        def gen(f):
            window = []
            for x in f:
                window.append(x)
                if len(window) == n:
                    yield tuple(window)
                    del window[:1]

        gen.n = n
        return gen


class UtilMains:
    """Ready-made main() bodies driven by sys.argv."""

    @staticmethod
    def process_multiple_files(callback, mnemonic="FILE"):
        """Call callback on each file named in sys.argv ('-' means stdin).

        Standard input is used when no file is named.  --help prints the
        usage line and exits with status 1.
        """

        if "--help" in sys.argv:
            print("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit(1)

        try:
            files = sys.argv[1:] if len(sys.argv) > 1 else ['-']
            for s in files:
                callback(FileHelpers.open_file_or_stdin(s))
        except IOError as e:
            _exit_on_io_error(e)

    @staticmethod
    def process_multiple_args(callback, mnemonic):
        """Call callback on each argument in sys.argv.

        With no argument, or with --help, prints the usage line and exits
        with status 1.
        """

        if len(sys.argv) == 1 or "--help" in sys.argv:
            print("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit(1)

        try:
            for s in sys.argv[1:]:
                callback(s)
        except IOError as e:
            _exit_on_io_error(e)

    @staticmethod
    def filter_multiple_strings_or_stdin(callback, mnemonic,
                                         separator=" ",
                                         concat_separator=False):
        """Print callback(x) for each argument, or for each line of stdin.

        With arguments, the results are joined with separator and printed
        on one line; if concat_separator is not False the arguments are
        first joined with it into a single argument.  Without arguments,
        each line of standard input (minus its trailing newline) is
        converted and printed on its own line.
        """

        if "--help" in sys.argv:
            print("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input."
                  % (sys.argv[0], mnemonic, sys.argv[0]))
            sys.exit(1)

        try:
            if len(sys.argv) == 1:
                # readline() returns '' only at end of input.
                for line in iter(sys.stdin.readline, ''):
                    if line.endswith('\n'):
                        line = line[:-1]
                    print(callback(line))
            else:
                args = sys.argv[1:]
                if concat_separator is not False:
                    args = [concat_separator.join(args)]
                print(separator.join(callback(x) for x in args))
        except IOError as e:
            _exit_on_io_error(e)


class Unicode:
    """Conversions between text and "U+XXXX" code point notation."""

    @staticmethod
    def decode(s):
        """Return the code points of s as a comma-separated "U+XXXX" list.

        s may be UTF-8 encoded bytes or text.  The result is a native str
        (UTF-8 encoded on Python 2).
        """
        if isinstance(s, bytes):
            s = s.decode('utf-8')
        text = u','.join("U+%04X" % ord(u) for u in s)
        if sys.version_info[0] == 2:
            text = text.encode('utf-8')
        return text

    @staticmethod
    def parse(s):
        """Return the list of ints for the hexadecimal numbers found in s.

        "0x" prefixes and the punctuation and letters used by the usual
        code point notations are treated as separators.
        """
        s = re.sub(r"0[xX]", " ", s)
        s = re.sub(r"[<+>{},;&#\\xXuUnNiI\n ]", " ", s)
        return [int(x, 16) for x in s.split()]

    @staticmethod
    def encode(s):
        """Return the text for the code points listed in s (see parse)."""
        s = u''.join(unichr(x) for x in Unicode.parse(s))
        if sys.version_info[0] == 2:
            s = s.encode('utf-8')
        return s

    shorthands = {
        "ZERO WIDTH NON-JOINER": "ZWNJ",
        "ZERO WIDTH JOINER": "ZWJ",
        "NARROW NO-BREAK SPACE": "NNBSP",
        "COMBINING GRAPHEME JOINER": "CGJ",
        "LEFT-TO-RIGHT MARK": "LRM",
        "RIGHT-TO-LEFT MARK": "RLM",
        "LEFT-TO-RIGHT EMBEDDING": "LRE",
        "RIGHT-TO-LEFT EMBEDDING": "RLE",
        "POP DIRECTIONAL FORMATTING": "PDF",
        "LEFT-TO-RIGHT OVERRIDE": "LRO",
        "RIGHT-TO-LEFT OVERRIDE": "RLO",
    }

    @staticmethod
    def pretty_name(u):
        """Return a shortened Unicode character name for character u.

        Returns "XXX" for a character that has no name.
        """
        try:
            s = unicodedata.name(u)
        except ValueError:
            return "XXX"
        s = re.sub(".* LETTER ", "", s)
        s = re.sub(".* VOWEL SIGN (.*)", r"\1-MATRA", s)
        s = re.sub(".* SIGN ", "", s)
        s = re.sub(".* COMBINING ", "", s)
        if re.match(".* VIRAMA", s):
            s = "HALANT"
        if s in Unicode.shorthands:
            s = Unicode.shorthands[s]
        return s

    @staticmethod
    def pretty_names(s):
        """Return the pretty names of the code points in s, joined by ' + '.

        The result is a native str (UTF-8 encoded on Python 2).
        """
        s = re.sub(r"[<+>\\uU]", " ", s)
        s = re.sub(r"0[xX]", " ", s)
        chars = [unichr(int(x, 16)) for x in re.split('[, \n]', s) if len(x)]
        names = u' + '.join(Unicode.pretty_name(x) for x in chars)
        if sys.version_info[0] == 2:
            names = names.encode('utf-8')
        return names


class FileHelpers:
    """File opening helpers."""

    @staticmethod
    def open_file_or_stdin(f):
        """Return sys.stdin for '-', otherwise file f opened for reading."""
        if f == '-':
            return sys.stdin
        return open(f)


class Manifest:
    """Reading and regenerating per-directory MANIFEST files."""

    @staticmethod
    def read(s, strict=True):
        """Yield the normalized paths of the files reachable from s.

        A file yields itself.  A directory is expanded through the entries
        of its MANIFEST file, recursively.  When s, or a directory's
        MANIFEST, is missing, strict mode reports it on stderr and exits
        with status 1; otherwise that entry is silently skipped.
        """

        if not os.path.exists(s):
            if strict:
                print("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
                sys.exit(1)
            return

        s = os.path.normpath(s)

        if not os.path.isdir(s):
            yield s
            return

        manifest = os.path.join(s, "MANIFEST")
        try:
            with open(manifest) as m:
                # A blank entry would join back to the directory itself and
                # recurse without end, so skip those.
                items = [x.strip() for x in m if x.strip()]
        except IOError:
            if strict:
                print("%s: %s does not exist" % (sys.argv[0], manifest), file=sys.stderr)
                sys.exit(1)
            return

        for f in items:
            for p in Manifest.read(os.path.join(s, f), strict):
                yield p

    @staticmethod
    def update_recursive(s):
        """Rewrite the MANIFEST file of s and of every directory below it.

        Each MANIFEST lists the directory's files, then its subdirectories,
        each group sorted; MANIFEST itself and a few well-known
        housekeeping names are left out (and such directories are not
        descended into).
        """

        skipped = ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]

        # os.walk already visits every subdirectory, so no explicit
        # recursion is needed here.
        for dirpath, dirnames, filenames in os.walk(s, followlinks=True):

            for f in skipped:
                if f in dirnames:
                    dirnames.remove(f)
                if f in filenames:
                    filenames.remove(f)
            dirnames.sort()
            filenames.sort()
            ms = os.path.join(dirpath, "MANIFEST")
            print(" GEN %s" % ms)
            with open(ms, "w") as m:
                for f in filenames:
                    print(f, file=m)
                for f in dirnames:
                    print(f, file=m)


if __name__ == '__main__':
    pass