#!/usr/bin/python

"""Shared helpers for command-line tools that diff and analyze test output.

Contents: color formatters, a word-level diff colorizer, a line-wise
multi-file differ, pass/fail statistics and n-gram analysis, Unicode
codepoint parsing/naming helpers, and MANIFEST file handling.

The module runs unchanged on Python 2.6+ and Python 3.
"""

import sys, os, re, difflib, unicodedata, errno
from itertools import *

# --- Python 2 / Python 3 compatibility shims -------------------------------

try:
    from html import escape as _html_escape  # Python 3
except ImportError:
    import cgi  # Python 2: cgi.escape escapes only &, < and > by default
    _html_escape = cgi.escape

try:
    izip_longest
except NameError:  # Python 3 renamed it
    izip_longest = zip_longest

try:
    unichr
except NameError:  # Python 3: chr covers the full Unicode range
    unichr = chr


def _to_unicode (s):
    """Return s as a text string, decoding it as UTF-8 if it is bytes."""
    if isinstance (s, bytes):
        return s.decode ('utf-8')
    return s


def _to_native (u):
    """Return text u as the native str type (UTF-8 encoded on Python 2)."""
    if str is bytes:
        return u.encode ('utf-8')
    return u


def _exit_on_io_error (e):
    """Exit with status 1 after an IOError, reporting it unless it is EPIPE.

    A broken pipe (e.g. output piped into `head`) is not worth a message.
    """
    if e.errno != errno.EPIPE:
        sys.stderr.write ("%s: %s: %s\n" % (sys.argv[0], e.filename, e.strerror))
    sys.exit (1)


# One marker character per input of a diff; index 0 is the first input.
diff_symbols = "-+=*&^%$#@!~/"
# Highlight color per input, indexed like diff_symbols.
diff_colors = ['red', 'green', 'blue']


class ColorFormatter:
    """Namespace of interchangeable output formatters.

    Each formatter exposes the same four static methods: start_color (c),
    end_color (), escape (s) and newline ().
    """

    class Null:
        """Formatter that emits no color markup at all."""

        @staticmethod
        def start_color (c): return ''
        @staticmethod
        def end_color (): return ''
        @staticmethod
        def escape (s): return s
        @staticmethod
        def newline (): return '\n'

    class ANSI:
        """Formatter that emits ANSI terminal escape sequences."""

        @staticmethod
        def start_color (c):
            # Raises KeyError for any color other than red/green/blue.
            return {
                'red': '\033[41;37;1m',
                'green': '\033[42;37;1m',
                'blue': '\033[44;37;1m',
            }[c]
        @staticmethod
        def end_color ():
            return '\033[m'
        @staticmethod
        def escape (s): return s
        @staticmethod
        def newline (): return '\n'

    class HTML:
        """Formatter that emits HTML spans and escapes markup characters."""

        @staticmethod
        def start_color (c):
            return '<span style="background:%s">' % c
        @staticmethod
        def end_color ():
            return '</span>'
        @staticmethod
        def escape (s):
            # quote=False: escape only &, < and >, like cgi.escape (s) did.
            return _html_escape (s, False)
        @staticmethod
        def newline (): return '<br/>\n'

    @staticmethod
    def Auto (argv = None, out = sys.stdout):
        """Pick a formatter class from command-line flags.

        Recognized flags are removed from argv in place (first occurrence
        only).  When several flags are present the one checked last wins;
        the check order is --format, --format=ansi, --format=html,
        --no-format.  Without any flag the ANSI formatter is returned.

        argv: mutable list of arguments; None means "no arguments".
        out:  accepted for interface compatibility; not used.
        """
        if argv is None:
            argv = []
        chosen = ColorFormatter.ANSI
        for flag, formatter in (("--format", ColorFormatter.ANSI),
                                ("--format=ansi", ColorFormatter.ANSI),
                                ("--format=html", ColorFormatter.HTML),
                                ("--no-format", ColorFormatter.Null)):
            if flag in argv:
                argv.remove (flag)
                chosen = formatter
        return chosen


class DiffColorizer:
    """Highlights the word-level differences between paired diff lines."""

    # Splits a line into runs of word characters, each followed by at most
    # one separator character.  The class must include A-Z, otherwise every
    # uppercase letter acts as a separator and words are cut mid-token.
    diff_regex = re.compile ('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

    def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
        """formatter: a ColorFormatter-style class; colors/symbols: per-input
        highlight colors and marker characters, indexed by input number."""
        self.formatter = formatter
        self.colors = colors
        self.symbols = symbols

    def colorize_lines (self, lines):
        """Return the given pair of lines with their differing tokens colored.

        lines: two strings (either may be None/empty, meaning "absent").
        Returns a list with one output line per non-empty input, each
        prefixed with its symbol and terminated by the formatter's newline.
        """
        lines = (l if l else '' for l in lines)
        # Put every token on its own line so difflib can compare tokens.
        ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
        oo = ["", ""]          # accumulated output per input
        st = [False, False]    # whether a color span is currently open

        def close_spans ():
            for i in range (2):
                if st[i]:
                    oo[i] += self.formatter.end_color ()
                    st[i] = False

        for l in difflib.Differ ().compare (*ss):
            tag = l[0]
            if tag == '?':
                # Intraline hint lines from difflib carry no content.
                continue
            if tag == ' ':
                # Token common to both inputs: close spans, copy to both.
                close_spans ()
                common = self.formatter.escape (l[2:])
                for i in range (2):
                    oo[i] += common
                continue
            if tag in self.symbols:
                # Token unique to one input: open that input's color span.
                i = self.symbols.index (tag)
                if not st[i]:
                    oo[i] += self.formatter.start_color (self.colors[i])
                    st[i] = True
                oo[i] += self.formatter.escape (l[2:])
                continue
        close_spans ()
        # Drop the newlines that were inserted to separate tokens.
        oo = [o.replace ('\n', '') for o in oo]
        return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]

    def colorize_diff (self, f):
        """Yield the lines of diff f with paired changed lines colorized.

        Lines whose first character is not one of self.symbols are passed
        through (escaped).  Marked lines are buffered per input and flushed
        through colorize_lines once both inputs have a line, or when an
        input gets a second line before its partner arrives.
        """
        lines = [None, None]
        for l in f:
            if l[0] not in self.symbols:
                yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
                continue
            i = self.symbols.index (l[0])
            if lines[i]:
                # Flush: this input already has a pending, unpaired line.
                for line in self.colorize_lines (lines):
                    yield line
                lines = [None, None]
            lines[i] = l[1:]
            if all (lines):
                # Flush: both inputs have a line, so they form a pair.
                for line in self.colorize_lines (lines):
                    yield line
                lines = [None, None]
        if any (lines):
            # Flush whatever is left at end of input.
            for line in self.colorize_lines (lines):
                yield line


class ZipDiffer:
    """Line-by-line differ for files that are expected to align exactly."""

    @staticmethod
    def diff_files (files, symbols=diff_symbols):
        """Write a positional diff of the given line iterables to stdout.

        Line N of every file is compared with line N of the others.  When
        they are all equal the line is written once with a " " prefix;
        otherwise each file's line is written prefixed by that file's
        symbol.  Exits with status 1 on IOError.
        """
        files = tuple (files) # in case it's a generator, copy it
        try:
            for lines in izip_longest (*files):
                if all (lines[0] == line for line in lines[1:]):
                    sys.stdout.writelines ([" ", lines[0]])
                    continue

                for i, l in enumerate (lines):
                    if l:  # exhausted files contribute None
                        sys.stdout.writelines ([symbols[i], l])
        except IOError as e:
            _exit_on_io_error (e)


class DiffFilters:
    """Generators that filter a diff stream."""

    @staticmethod
    def filter_failures (f):
        """Yield only the lines of f that belong to failed test cases."""
        for key, lines in DiffHelpers.separate_test_cases (f):
            lines = list (lines)
            if not DiffHelpers.test_passed (lines):
                for l in lines:
                    yield l


class Stat:
    """Counter of tests and of their accumulated frequency."""

    def __init__ (self):
        self.count = 0
        self.freq = 0

    def add (self, test):
        """Count one test and add its `freq` attribute to the total."""
        self.count += 1
        self.freq += test.freq


class Stats:
    """Passed / failed / total counters for a set of tests."""

    def __init__ (self):
        self.passed = Stat ()
        self.failed = Stat ()
        self.total = Stat ()

    def add (self, test):
        """Record test in the total and in passed or failed by test.passed."""
        self.total.add (test)
        if test.passed:
            self.passed.add (test)
        else:
            self.failed.add (test)

    def mean (self):
        """Fraction of tests that passed.  Raises ZeroDivisionError if empty."""
        return float (self.passed.count) / self.total.count

    def variance (self):
        """Pass fraction times fail fraction.  Raises ZeroDivisionError if empty."""
        return (float (self.passed.count) / self.total.count) * \
               (float (self.failed.count) / self.total.count)

    def stddev (self):
        """Square root of variance ()."""
        return self.variance () ** .5

    def zscore (self, population):
        """Calculate the standard score.
        Population is the Stats for population.
        Self is Stats for sample.
        Returns larger absolute value if sample is highly unlikely to be random.
        Anything outside of -3..+3 is very unlikely to be random.
        See: http://en.wikipedia.org/wiki/Standard_score

        Returns 0.0 when the population has zero standard deviation (all of
        its tests passed, or all failed) instead of dividing by zero."""

        deviation = population.stddev ()
        if not deviation:
            return 0.0
        return (self.mean () - population.mean ()) / deviation


class DiffSinks:
    """Consumers that read a whole diff stream and print a summary."""

    @staticmethod
    def print_stat (f):
        """Print how many test cases in diff f passed and failed."""
        passed = 0
        failed = 0
        # XXX port to Stats, but that would really slow us down here
        for key, lines in DiffHelpers.separate_test_cases (f):
            if DiffHelpers.test_passed (lines):
                passed += 1
            else:
                failed += 1
        total = passed + failed
        # Empty input has no meaningful failure rate; report 0 rather than
        # dividing by zero.
        percent = 100. * failed / total if total else 0.
        print ("%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, percent))

    @staticmethod
    def print_ngrams (f, ns=(1,2,3), min_failed=30):
        """Print the z-score of every codepoint n-gram that fails often.

        f:          diff stream to analyze.
        ns:         n-gram lengths to collect.
        min_failed: only n-grams seen in at least this many failed tests
                    are reported (for statistical reasons).
        """
        gens = tuple (Ngram.generator (n) for n in ns)
        allstats = Stats ()
        allgrams = {}
        for key, lines in DiffHelpers.separate_test_cases (f):
            test = Test (lines)
            allstats.add (test)

            # A test case without a <...> line has no codepoints.
            unicodes = test.unicodes or ()
            for gen in gens:
                for ngram in gen (unicodes):
                    if ngram not in allgrams:
                        allgrams[ngram] = Stats ()
                    allgrams[ngram].add (test)

        for ngram, stats in allgrams.items ():
            if stats.failed.count < min_failed:
                continue
            print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))


class Test:
    """One test case parsed from the diff lines that share an identifier.

    Each line looks like `<symbol><identifier>: <open>content<close>`.
    The bracket pair selects the field: (...) sets text, <...> sets
    unicodes (parsed into a list of ints) and [...] sets glyphs.
    The test counts as passed only if every line's symbol is a space.
    """

    def __init__ (self, lines):
        self.freq = 1
        self.passed = True
        self.identifier = None
        self.text = None
        self.unicodes = None
        self.glyphs = None
        for l in lines:
            symbol = l[0]
            if symbol != ' ':
                self.passed = False
            i = 1
            if ':' in l:
                i = l.index (':')
                if not self.identifier:
                    self.identifier = l[1:i]
                i = i + 2 # Skip colon and space
            # j indexes the closing bracket: last character, or the one
            # before a trailing newline.
            j = -1
            if l[j] == '\n':
                j -= 1
            brackets = l[i] + l[j]
            # Slice up to j (not a fixed -2) so that a line without a
            # trailing newline keeps its last content character.
            l = l[i+1:j]
            if brackets == '()':
                self.text = l
            elif brackets == '<>':
                self.unicodes = Unicode.parse (l)
            elif brackets == '[]':
                # XXX we don't handle failed tests here
                self.glyphs = l


class DiffHelpers:
    """Helpers for grouping and judging diff lines."""

    @staticmethod
    def separate_test_cases (f):
        '''Reads lines from f, and if the lines have identifiers, ie.
        have a colon character, groups them by identifier,
        yielding lists of all lines with the same identifier.

        Only consecutive lines are grouped (itertools.groupby); a line
        without a colon after its first character is its own key.'''

        def identifier (l):
            if ':' in l[1:]:
                return l[1:l.index (':')]
            return l
        return groupby (f, key=identifier)

    @staticmethod
    def test_passed (lines):
        """Return True if the test case made of these diff lines passed.

        A case passes when all its lines are unchanged (start with a
        space), or when any '+' line contains one of the marker strings
        below.
        """
        lines = list (lines)
        # XXX This is a hack, but does the job for now.
        # NOTE(review): presumably these mark output that is not a real
        # mismatch (missing glyphs and the like) -- confirm.
        markers = ("space+0|space+0", "uni25CC", "dottedcircle",
                   "glyph0", "gid0", "notdef")
        for l in lines:
            if l[0] == '+' and any (marker in l for marker in markers):
                return True
        return all (l[0] == ' ' for l in lines)


class FilterHelpers:
    """Adapters turning a filter generator into a printing function."""

    @staticmethod
    def filter_printer_function (filter_callback):
        """Return a function printing each filtered line plus a newline."""
        def printer (f):
            for line in filter_callback (f):
                print (line)
        return printer

    @staticmethod
    def filter_printer_function_no_newline (filter_callback):
        """Return a function writing each filtered line to stdout verbatim."""
        def printer (f):
            for line in filter_callback (f):
                sys.stdout.writelines ([line])
        return printer


class Ngram:
    """Factory for sliding-window n-gram generators."""

    @staticmethod
    def generator (n):
        """Return a generator function yielding all n-tuples of consecutive
        items of its argument.  The function's `n` attribute records n."""

        def gen (f):
            window = []
            for x in f:
                window.append (x)
                if len (window) == n:
                    yield tuple (window)
                    del window[:1]  # slide: drop the oldest item

        gen.n = n
        return gen


class UtilMains:
    """Ready-made `main` bodies for small command-line tools."""

    @staticmethod
    def process_multiple_files (callback, mnemonic = "FILE"):
        """Call callback on every file named in sys.argv[1:] (opened), or on
        stdin when there are no arguments.  --help prints usage and exits."""

        if "--help" in sys.argv:
            print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit (1)

        try:
            files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
            for s in files:
                callback (FileHelpers.open_file_or_stdin (s))
        except IOError as e:
            _exit_on_io_error (e)

    @staticmethod
    def process_multiple_args (callback, mnemonic):
        """Call callback on every string in sys.argv[1:].  Prints usage and
        exits when there are no arguments or --help is given."""

        if len (sys.argv) == 1 or "--help" in sys.argv:
            print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit (1)

        try:
            for s in sys.argv[1:]:
                callback (s)
        except IOError as e:
            _exit_on_io_error (e)

    @staticmethod
    def filter_multiple_strings_or_stdin (callback, mnemonic, \
                                          separator = " ", \
                                          concat_separator = False):
        """Print callback applied to the arguments, or to each stdin line.

        With arguments: callback is applied to each one and the results are
        printed on one line joined by separator.  If concat_separator is
        given, the arguments are first joined with it into a single string.
        Without arguments: each line of stdin (newline stripped) is passed
        to callback and the result printed.
        """

        if "--help" in sys.argv:
            print ("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input." \
                   % (sys.argv[0], mnemonic, sys.argv[0]))
            sys.exit (1)

        try:
            if len (sys.argv) == 1:
                # readline () rather than iterating stdin, so each line is
                # handled as soon as it arrives.
                while (1):
                    line = sys.stdin.readline ()
                    if not len (line):
                        break
                    if line[-1] == '\n':
                        line = line[:-1]
                    print (callback (line))
            else:
                args = sys.argv[1:]
                if concat_separator != False:
                    args = [concat_separator.join (args)]
                print (separator.join (callback (x) for x in (args)))
        except IOError as e:
            _exit_on_io_error (e)


class Unicode:
    """Conversions between text and "U+XXXX" codepoint notation."""

    @staticmethod
    def decode (s):
        """Return the codepoints of s as a "U+XXXX,U+XXXX" string.
        s may be text or UTF-8 encoded bytes."""
        return _to_native (u','.join ("U+%04X" % ord (u) for u in _to_unicode (s)))

    @staticmethod
    def parse (s):
        """Parse hexadecimal codepoints out of s into a list of ints.

        Prefixes and separators such as "0x", "U+", "\\u", "<", ">", ","
        and ";" are treated as whitespace.  Raises ValueError if a
        remaining token is not hexadecimal.
        """
        s = re.sub (r"0[xX]", " ", s)
        s = re.sub (r"[<+>,;&#\\xXuU\n ]", " ", s)
        return [int (x, 16) for x in s.split ()]

    @staticmethod
    def encode (s):
        """Return the text whose codepoints are listed in s (see parse)."""
        return _to_native (u''.join (unichr (x) for x in Unicode.parse (s)))

    # Abbreviations used by pretty_name for long character names.
    shorthands = {
        "ZERO WIDTH NON-JOINER": "ZWNJ",
        "ZERO WIDTH JOINER": "ZWJ",
        "NARROW NO-BREAK SPACE": "NNBSP",
        "COMBINING GRAPHEME JOINER": "CGJ",
        "LEFT-TO-RIGHT MARK": "LRM",
        "RIGHT-TO-LEFT MARK": "RLM",
        "LEFT-TO-RIGHT EMBEDDING": "LRE",
        "RIGHT-TO-LEFT EMBEDDING": "RLE",
        "POP DIRECTIONAL FORMATTING": "PDF",
        "LEFT-TO-RIGHT OVERRIDE": "LRO",
        "RIGHT-TO-LEFT OVERRIDE": "RLO",
    }

    @staticmethod
    def pretty_name (u):
        """Return a shortened Unicode character name for the character u,
        or "XXX" if the character has no name."""
        try:
            s = unicodedata.name (u)
        except ValueError:
            return "XXX"
        s = re.sub (".* LETTER ", "", s)
        s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
        s = re.sub (".* SIGN ", "", s)
        s = re.sub (".* COMBINING ", "", s)
        # NOTE(review): for names like "... SIGN VIRAMA" the substitution
        # above has already reduced s to "VIRAMA", which this pattern
        # (requiring a preceding space) does not match -- confirm intent.
        if re.match (".* VIRAMA", s):
            s = "HALANT"
        if s in Unicode.shorthands:
            s = Unicode.shorthands[s]
        return s

    @staticmethod
    def pretty_names (s):
        """Return the pretty names of the codepoints listed in s, joined
        by " + "."""
        s = re.sub (r"[<+>\\uU]", " ", s)
        s = re.sub (r"0[xX]", " ", s)
        s = [unichr (int (x, 16)) for x in re.split ('[, \n]', s) if len (x)]
        return _to_native (u' + '.join (Unicode.pretty_name (x) for x in s))


class FileHelpers:
    """File opening helpers."""

    @staticmethod
    def open_file_or_stdin (f):
        """Return sys.stdin for '-', otherwise the file f opened for reading.
        The caller owns (and should close) the returned file."""
        if f == '-':
            return sys.stdin
        return open (f)


class Manifest:
    """Reading and generating MANIFEST files that list directory contents."""

    @staticmethod
    def read (s, strict = True):
        """Yield the normalized path of every file reachable from s.

        If s is a file it is yielded itself.  If s is a directory, the
        entries named in its MANIFEST file are read recursively.  When s
        or a MANIFEST is missing: with strict, an error is printed and the
        process exits with status 1; otherwise nothing is yielded.
        """

        if not os.path.exists (s):
            if strict:
                sys.stderr.write ("%s: %s does not exist\n" % (sys.argv[0], s))
                sys.exit (1)
            return

        s = os.path.normpath (s)

        if not os.path.isdir (s):
            yield s
            return

        manifest = os.path.join (s, "MANIFEST")
        try:
            with open (manifest) as m:
                items = [x.strip () for x in m]
        except IOError:
            if strict:
                sys.stderr.write ("%s: %s does not exist\n" % (sys.argv[0], manifest))
                sys.exit (1)
            return

        for f in items:
            if not f:
                # A blank line would resolve to the directory itself and
                # recurse forever.
                continue
            # NOTE(review): nested entries are always read in strict mode,
            # as in the original; confirm whether `strict` should propagate.
            for p in Manifest.read (os.path.join (s, f)):
                yield p

    @staticmethod
    def update_recursive (s):
        """(Re)write a MANIFEST file in s and in every directory below it.

        Each MANIFEST lists the directory's files, then its subdirectories,
        both sorted, one per line.  A fixed set of well-known names
        (MANIFEST, README, LICENSE, ...) is neither listed nor descended
        into.
        """
        skipped = ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]

        # os.walk already visits every subdirectory (pruned and ordered by
        # the in-place edits to dirnames below), so no explicit recursion
        # is needed.
        for dirpath, dirnames, filenames in os.walk (s, followlinks=True):

            for f in skipped:
                if f in dirnames:
                    dirnames.remove (f)
                if f in filenames:
                    filenames.remove (f)
            dirnames.sort ()
            filenames.sort ()
            ms = os.path.join (dirpath, "MANIFEST")
            print (" GEN %s" % ms)
            with open (ms, "w") as m:
                for f in filenames:
                    m.write ("%s\n" % f)
                for f in dirnames:
                    m.write ("%s\n" % f)


if __name__ == '__main__':
    pass