1#!/usr/bin/env python
2
3from __future__ import print_function
4import sys, os, re, difflib, unicodedata, errno, cgi
5from itertools import *
6
# Marker characters for diff lines: ZipDiffer prefixes the line coming from
# the i-th input file with diff_symbols[i].
diff_symbols = "-+=*&^%$#@!~/"
# Highlight colors, indexed in parallel with diff_symbols by DiffColorizer.
diff_colors = ["red", "green", "blue"]
9
try:
	# Python 2: the builtin exists; bind it as a module-level name.
	unichr = unichr
except NameError:
	# Python 3: there is no unichr builtin, chr plays that role.
	unichr = chr
else:
	if sys.maxunicode < 0x10FFFF:
		# Python 2 "narrow" build (UCS2-only): wrap the builtin so that code
		# points above 0xFFFF are supported as well.

		_narrow_unichr = unichr

		def unichr(i):
			"""
			Return the unicode string for the Unicode code point 'i'.
			Accepted values are 0 to 0x10FFFF inclusive; anything the narrow
			builtin rejects is built through a unicode-escape sequence.

			>>> _narrow_unichr(0xFFFF + 1)
			Traceback (most recent call last):
			  File "<stdin>", line 1, in ?
			ValueError: unichr() arg not in range(0x10000) (narrow Python build)
			>>> unichr(0xFFFF + 1) == u'\U00010000'
			True
			>>> unichr(1114111) == u'\U0010FFFF'
			True
			>>> unichr(0x10FFFF + 1)
			Traceback (most recent call last):
			  File "<stdin>", line 1, in ?
			ValueError: unichr() arg not in range(0x110000)
			"""
			try:
				return _narrow_unichr(i)
			except ValueError:
				pass
			# Spell the code point as an 8-digit \UXXXXXXXX escape and let the
			# unicode-escape codec decode it.
			escape_str = "\\U" + hex(i)[2:].zfill(8)
			try:
				return escape_str.decode("unicode-escape")
			except UnicodeDecodeError:
				raise ValueError('unichr() arg not in range(0x110000)')
48
class ColorFormatter:
	"""Namespace for the output formatters used to highlight diffs.

	Null, ANSI and HTML expose the same static interface:
	start_color (c), end_color (), escape (s) and newline ().
	Auto () picks one of them from command-line style flags.
	"""

	class Null:
		"""Formatter without any markup: text is passed through unchanged."""
		@staticmethod
		def start_color (c): return ''
		@staticmethod
		def end_color (): return ''
		@staticmethod
		def escape (s): return s
		@staticmethod
		def newline (): return '\n'

	class ANSI:
		"""Formatter emitting ANSI terminal escape sequences."""

		# Escape sequence per supported color name; built once instead of
		# on every start_color () call.
		_codes = {
			'red': '\033[41;37;1m',
			'green': '\033[42;37;1m',
			'blue': '\033[44;37;1m',
		}

		@staticmethod
		def start_color (c):
			"""Return the escape sequence for color c; raises KeyError for unknown names."""
			return ColorFormatter.ANSI._codes[c]
		@staticmethod
		def end_color ():
			return '\033[m'
		@staticmethod
		def escape (s): return s
		@staticmethod
		def newline (): return '\n'

	class HTML:
		"""Formatter emitting HTML <span> markup."""
		@staticmethod
		def start_color (c):
			return '<span style="background:%s">' % c
		@staticmethod
		def end_color ():
			return '</span>'
		@staticmethod
		def escape (s):
			"""Escape &, < and > in s; quotes are left untouched."""
			# cgi.escape no longer exists on Python 3.8+; html.escape with
			# quote=False produces the same output.
			try:
				from html import escape as html_escape
			except ImportError:
				# Python 2 has no html module.
				from cgi import escape as html_escape
			return html_escape (s, quote=False)
		@staticmethod
		def newline (): return '<br/>\n'

	@staticmethod
	def Auto (argv = None, out = sys.stdout):
		"""Select a formatter class from the flags found in argv.

		Recognized flags (--format, --format=ansi, --format=html,
		--no-format) are removed from argv in place.  When several are
		present the one checked last wins; without any flag ANSI is used.
		The out parameter is accepted but not used.
		"""
		if argv is None:
			# Avoid sharing one mutable default list between calls.
			argv = []
		chosen = ColorFormatter.ANSI
		# Checked in this order, so later entries override earlier ones.
		for flag, formatter in (
			("--format", ColorFormatter.ANSI),
			("--format=ansi", ColorFormatter.ANSI),
			("--format=html", ColorFormatter.HTML),
			("--no-format", ColorFormatter.Null),
		):
			if flag in argv:
				argv.remove (flag)
				chosen = formatter
		return chosen
105
106
class DiffColorizer:
	"""Highlights the differing parts of paired diff lines.

	Works on two-way diffs only: lines marked with symbols[0] are compared
	against lines marked with symbols[1].
	"""

	# One token = a run of word characters followed by at most one
	# non-word character.  Both cases of letters belong to a word.
	diff_regex = re.compile ('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

	def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
		"""Store the formatter, the per-side colors and the line marker symbols."""
		self.formatter = formatter
		self.colors = colors
		self.symbols = symbols

	def colorize_lines (self, lines):
		"""Return the formatted output lines for one pair of diff lines.

		lines is a two-item sequence (first side, second side); an item may
		be None or empty when that side is missing.  Tokens present on only
		one side are wrapped in that side's color.  Sides with no content
		produce no output line.
		"""
		lines = (l if l else '' for l in lines)
		# Put every token on its own line so difflib can compare token lists.
		ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
		oo = [[], []] # output fragments per side
		st = [False, False] # is a color span currently open on that side?
		for l in difflib.Differ().compare (*ss):
			code, token = l[0], l[2:]
			if code == ' ':
				# Token common to both sides: close open spans, emit it on both.
				for i in range(2):
					if st[i]:
						oo[i].append (self.formatter.end_color ())
						st[i] = False
				text = self.formatter.escape (token)
				for o in oo:
					o.append (text)
				continue
			# difflib always reports '-' for the first sequence and '+' for
			# the second, whatever marker symbols this colorizer was given.
			i = '-+'.find (code)
			if i < 0:
				# '?' hint lines carry no content.
				continue
			if not st[i]:
				oo[i].append (self.formatter.start_color (self.colors[i]))
				st[i] = True
			oo[i].append (self.formatter.escape (token))
		for i in range(2):
			if st[i]:
				oo[i].append (self.formatter.end_color ())
				st[i] = False
		# Drop the token separators (and the original line ending).
		oo = [''.join (o).replace ('\n', '') for o in oo]
		return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]

	def colorize_diff (self, f):
		"""Yield formatted lines for the diff lines read from f.

		Lines whose first character is not one of the marker symbols are
		escaped and passed through.  Marked lines are buffered until both
		sides of a pair are available (or a side repeats, or input ends)
		and then colorized together.
		"""
		lines = [None, None]
		for l in f:
			if l[0] not in self.symbols:
				if any (lines):
					# Emit the pending changed line first so that output
					# keeps the order of the input.
					for line in self.colorize_lines (lines):
						yield line
					lines = [None, None]
				yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
				continue
			i = self.symbols.index (l[0])
			if lines[i]:
				# Same side twice in a row: flush what is pending.
				for line in self.colorize_lines (lines):
					yield line
				lines = [None, None]
			lines[i] = l[1:]
			if (all (lines)):
				# Both sides present: flush the pair.
				for line in self.colorize_lines (lines):
					yield line
				lines = [None, None]
		if (any (lines)):
			# Flush whatever is left at end of input.
			for line in self.colorize_lines (lines):
				yield line
167
168
class ZipDiffer:
	"""Line-by-line ("zipped") comparison of several line sources."""

	@staticmethod
	def diff_files (files, symbols=diff_symbols):
		"""Write a positional diff of files to standard output.

		files is an iterable of line iterables.  The n-th lines of all
		files are compared: if they are all equal the line is written once
		prefixed with a space, otherwise each file's line is written
		prefixed with symbols[i] for file number i.  Files that have run
		out of lines contribute nothing.

		An IOError other than a broken pipe is reported on stderr and
		terminates the process with status 1; a broken pipe is ignored.
		"""
		# itertools.izip_longest was renamed to zip_longest in Python 3.
		try:
			from itertools import zip_longest as longest_zip
		except ImportError:
			from itertools import izip_longest as longest_zip

		files = tuple (files) # in case it's a generator, copy it
		try:
			for lines in longest_zip (*files):
				if all (lines[0] == line for line in lines[1:]):
					sys.stdout.writelines ([" ", lines[0]])
					continue

				for i, l in enumerate (lines):
					if l:
						sys.stdout.writelines ([symbols[i], l])
		except IOError as e:
			if e.errno != errno.EPIPE:
				print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
				sys.exit (1)
187
188
class DiffFilters:
	"""Generators that filter a stream of diff lines."""

	@staticmethod
	def filter_failures (f):
		"""Yield only the lines of test cases that did not pass.

		Lines are grouped with DiffHelpers.separate_test_cases and each
		group is judged with DiffHelpers.test_passed.
		"""
		for _identifier, group in DiffHelpers.separate_test_cases (f):
			# Materialize the group: it is both tested and re-emitted.
			case = list (group)
			if DiffHelpers.test_passed (case):
				continue
			for line in case:
				yield line
197
class Stat:
	"""Running tally: number of tests added and the sum of their freq."""

	def __init__ (self):
		self.count = self.freq = 0

	def add (self, test):
		"""Count one more test and accumulate its freq attribute."""
		self.count = self.count + 1
		self.freq = self.freq + test.freq
207
class Stats:
	"""Pass/fail tallies for a set of tests, with simple statistics on the pass ratio."""

	def __init__ (self):
		self.passed, self.failed, self.total = Stat (), Stat (), Stat ()

	def add (self, test):
		"""Record test in the total and in the passed or failed tally."""
		self.total.add (test)
		bucket = self.passed if test.passed else self.failed
		bucket.add (test)

	def mean (self):
		"""Return the fraction of tests that passed."""
		n = self.total.count
		return float (self.passed.count) / n

	def variance (self):
		"""Return pass fraction times fail fraction."""
		n = self.total.count
		pass_ratio = float (self.passed.count) / n
		fail_ratio = float (self.failed.count) / n
		return pass_ratio * fail_ratio

	def stddev (self):
		"""Return the square root of variance ()."""
		return self.variance () ** .5

	def zscore (self, population):
		"""Calculate the standard score.
		   Population is the Stats for population.
		   Self is Stats for sample.
		   Returns larger absolute value if sample is highly unlikely to be random.
		   Anything outside of -3..+3 is very unlikely to be random.
		   See: http://en.wikipedia.org/wiki/Standard_score"""

		deviation = self.mean () - population.mean ()
		return deviation / population.stddev ()
241
242
243
244
class DiffSinks:
	"""Consumers that read a diff stream and print a summary to stdout."""

	@staticmethod
	def print_stat (f):
		"""Print how many test cases in f passed and failed.

		Test cases are delimited by DiffHelpers.separate_test_cases and
		judged by DiffHelpers.test_passed.
		"""
		passed = 0
		failed = 0
		# XXX port to Stats, but that would really slow us down here
		for key, lines in DiffHelpers.separate_test_cases (f):
			if DiffHelpers.test_passed (lines):
				passed += 1
			else:
				failed += 1
		total = passed + failed
		# Empty input has no failure rate; report 0% rather than dividing by zero.
		percent = 100. * failed / total if total else 0.
		print ("%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, percent))

	@staticmethod
	def print_ngrams (f, ns=(1,2,3)):
		"""Print the z-score of every frequent-enough code point n-gram.

		For each n in ns, the n-grams of every test's unicodes are
		collected together with pass/fail statistics of the tests they
		occur in.  N-grams seen in fewer than 30 failed tests are skipped.
		"""
		gens = tuple (Ngram.generator (n) for n in ns)
		allstats = Stats ()
		allgrams = {}
		for key, lines in DiffHelpers.separate_test_cases (f):
			test = Test (lines)
			allstats.add (test)

			# A test case without a <...> line has unicodes left as None.
			unicodes = test.unicodes or ()
			for gen in gens:
				for ngram in gen (unicodes):
					if ngram not in allgrams:
						allgrams[ngram] = Stats ()
					allgrams[ngram].add (test)

		# dict.items works on both Python 2 and 3 (iteritems is Python 2 only).
		for ngram, stats in allgrams.items ():
			if stats.failed.count < 30: # for statistical reasons
				continue
			print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))
284
285
286
class Test:
	"""One test case parsed from its group of diff lines.

	Each line starts with a one-character diff symbol, optionally followed
	by "identifier: ", then a payload enclosed in brackets:
	(...) is stored as text, <...> is parsed into unicodes with
	Unicode.parse, and [...] is stored as glyphs.  The test counts as
	passed only if every line starts with a space.
	"""

	def __init__ (self, lines):
		self.freq = 1
		self.passed = True
		self.identifier = None
		self.text = None
		self.unicodes = None
		self.glyphs = None
		for l in lines:
			symbol = l[0]
			if symbol != ' ':
				self.passed = False
			# i: index of the opening bracket.
			i = 1
			if ':' in l:
				i = l.index (':')
				if not self.identifier:
					self.identifier = l[1:i]
				i = i + 2 # Skip colon and space
			# j: index of the closing bracket (last character, ignoring a trailing newline).
			j = -1
			if l[j] == '\n':
				j -= 1
			brackets = l[i] + l[j]
			# Slice up to the closing bracket itself, so that a final line
			# without a trailing newline does not lose a payload character.
			l = l[i+1:j]
			if brackets == '()':
				self.text = l
			elif brackets == '<>':
				self.unicodes = Unicode.parse (l)
			elif brackets == '[]':
				# XXX we don't handle failed tests here
				self.glyphs = l
318
319
class DiffHelpers:
	"""Helpers for grouping diff lines into test cases and judging them."""

	# Substrings that make a test case count as passed when they occur
	# in one of its '+' lines.
	_passing_markers = (
		"space+0|space+0",
		"uni25CC",
		"dottedcircle",
		"glyph0",
		"gid0",
		"notdef",
	)

	@staticmethod
	def separate_test_cases (f):
		'''Reads lines from f, and if the lines have identifiers, ie.
		   have a colon character, groups them by identifier,
		   yielding lists of all lines with the same identifier.

		   Returns an itertools.groupby iterator of (key, lines) pairs;
		   only consecutive lines with the same key are grouped.  A line
		   without a colon after its first character is its own key.'''

		def identifier (l):
			# Search from index 1 so that the leading diff symbol is never
			# mistaken for the identifier separator.
			colon = l.find (':', 1)
			if colon >= 0:
				return l[1:colon]
			return l
		return groupby (f, key=identifier)

	@staticmethod
	def test_passed (lines):
		'''Return True if the test case made of lines counts as passed:
		   either every line starts with a space, or a line starting with
		   '+' contains one of the known markers.'''
		lines = list (lines)
		# XXX This is a hack, but does the job for now.
		added = [l for l in lines if l[0] == '+']
		if any (marker in l for l in added for marker in DiffHelpers._passing_markers):
			return True
		return all (l[0] == ' ' for l in lines)
345
346
class FilterHelpers:
	"""Adapters turning a filter generator into a function that prints its output."""

	@staticmethod
	def filter_printer_function (filter_callback):
		"""Return a function printing each item produced by filter_callback (f) on its own line."""
		def printer (f):
			for item in filter_callback (f):
				print (item)
		return printer

	@staticmethod
	def filter_printer_function_no_newline (filter_callback):
		"""Return a function writing each item produced by filter_callback (f) to stdout as is."""
		def printer (f):
			for item in filter_callback (f):
				# Items are expected to carry their own line endings.
				sys.stdout.write (item)
		return printer
362
363
class Ngram:
	"""Factory for sliding-window n-gram generators."""

	@staticmethod
	def generator (n):
		"""Return a generator function yielding every run of n consecutive items as a tuple.

		The returned function carries n as its .n attribute.
		"""

		def gen (f):
			window = []
			for item in f:
				window.append (item)
				if len (window) != n:
					continue
				yield tuple (window)
				# Slide the window forward by one item.
				del window[0]

		gen.n = n
		return gen
379
380
class UtilMains:
	"""Main-function helpers that drive a callback from sys.argv / stdin."""

	@staticmethod
	def _report_io_error (e):
		"""Print e to stderr and exit with status 1, unless e is a broken pipe."""
		if e.errno == errno.EPIPE:
			return
		print ("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
		sys.exit (1)

	@staticmethod
	def process_multiple_files (callback, mnemonic = "FILE"):
		"""Call callback with an open file for every command-line argument.

		Without arguments standard input ('-') is used.  --help prints a
		usage line and exits with status 1.
		"""

		if "--help" in sys.argv:
			print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
			sys.exit (1)

		try:
			for name in (sys.argv[1:] or ['-']):
				callback (FileHelpers.open_file_or_stdin (name))
		except IOError as e:
			UtilMains._report_io_error (e)

	@staticmethod
	def process_multiple_args (callback, mnemonic):
		"""Call callback with every command-line argument.

		With no arguments, or with --help, prints a usage line and exits
		with status 1.
		"""

		if len (sys.argv) == 1 or "--help" in sys.argv:
			print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
			sys.exit (1)

		try:
			for arg in sys.argv[1:]:
				callback (arg)
		except IOError as e:
			UtilMains._report_io_error (e)

	@staticmethod
	def filter_multiple_strings_or_stdin (callback, mnemonic, \
					      separator = " ", \
					      concat_separator = False):
		"""Print callback applied to the arguments, or to each line of stdin.

		With arguments: the results of callback for each argument are
		joined with separator and printed on one line; if concat_separator
		is not False the arguments are first joined with it into a single
		string.  Without arguments: every line of standard input (minus
		its trailing newline) is passed to callback and the result printed.
		--help prints usage and exits with status 1.
		"""

		if "--help" in sys.argv:
			print ("Usage:\n  %s %s...\nor:\n  %s\n\nWhen called with no arguments, input is read from standard input." \
			      % (sys.argv[0], mnemonic, sys.argv[0]))
			sys.exit (1)

		try:
			if len (sys.argv) != 1:
				args = sys.argv[1:]
				if concat_separator != False:
					args = [concat_separator.join (args)]
				print (separator.join (callback (x) for x in (args)))
				return

			while True:
				line = sys.stdin.readline ()
				if not len (line):
					# Empty string means end of input.
					break
				if line[-1] == '\n':
					line = line[:-1]
				print (callback (line))
		except IOError as e:
			UtilMains._report_io_error (e)
442
443
class Unicode:
	"""Conversions between text, code point lists and "U+XXXX" notation."""

	@staticmethod
	def decode (s):
		"""Return the code points of s as a comma-separated "U+XXXX" string.

		s may be text or UTF-8 encoded bytes.  On Python 2 the result is
		UTF-8 encoded, like the result of encode ().
		"""
		if isinstance (s, bytes):
			# Python 2 str, or raw bytes on Python 3.
			s = s.decode ('utf-8')
		s = u','.join (u"U+%04X" % ord (u) for u in s)
		if sys.version_info[0] == 2: s = s.encode ('utf-8')
		return s

	@staticmethod
	def parse (s):
		"""Return the list of integers for the hexadecimal code points found in s.

		Prefixes and separators such as "0x", "U+", "<", ">" and "," are
		treated as whitespace.  Raises ValueError if a remaining word is
		not a hexadecimal number.
		"""
		s = re.sub (r"0[xX]", " ", s)
		s = re.sub (r"[<+>{},;&#\\xXuUnNiI\n\t]", " ", s)
		return [int (x, 16) for x in s.split ()]

	@staticmethod
	def encode (s):
		"""Return the text made of the code points listed in s (see parse ())."""
		s = u''.join (unichr (x) for x in Unicode.parse (s))
		if sys.version_info[0] == 2: s = s.encode ('utf-8')
		return s

	# Abbreviations substituted for some full character names by pretty_name ().
	shorthands = {
		"ZERO WIDTH NON-JOINER": "ZWNJ",
		"ZERO WIDTH JOINER": "ZWJ",
		"NARROW NO-BREAK SPACE": "NNBSP",
		"COMBINING GRAPHEME JOINER": "CGJ",
		"LEFT-TO-RIGHT MARK": "LRM",
		"RIGHT-TO-LEFT MARK": "RLM",
		"LEFT-TO-RIGHT EMBEDDING": "LRE",
		"RIGHT-TO-LEFT EMBEDDING": "RLE",
		"POP DIRECTIONAL FORMATTING": "PDF",
		"LEFT-TO-RIGHT OVERRIDE": "LRO",
		"RIGHT-TO-LEFT OVERRIDE": "RLO",
	}

	@staticmethod
	def pretty_name (u):
		"""Return a shortened form of the Unicode name of character u.

		Returns "XXX" when unicodedata has no name for u.
		"""
		try:
			s = unicodedata.name (u)
		except ValueError:
			return "XXX"
		s = re.sub (".* LETTER ", "", s)
		s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
		s = re.sub (".* SIGN ", "", s)
		s = re.sub (".* COMBINING ", "", s)
		if re.match (".* VIRAMA", s):
			s = "HALANT"
		if s in Unicode.shorthands:
			s = Unicode.shorthands[s]
		return s

	@staticmethod
	def pretty_names (s):
		"""Return the pretty names of the code points listed in s, joined by " + ".

		On Python 2 the result is UTF-8 encoded, like the result of encode ().
		"""
		s = re.sub (r"[<+>\\uU]", " ", s)
		s = re.sub (r"0[xX]", " ", s)
		s = [unichr (int (x, 16)) for x in re.split ('[, \n]', s) if len (x)]
		s = u' + '.join (Unicode.pretty_name (x) for x in s)
		# Only Python 2 needs a byte string here; on Python 3 text is returned.
		if sys.version_info[0] == 2: s = s.encode ('utf-8')
		return s
498
499
class FileHelpers:
	"""Small helpers for opening input files."""

	@staticmethod
	def open_file_or_stdin (f):
		"""Return sys.stdin if f is '-', otherwise the file named f opened for reading.

		Raises IOError if the file cannot be opened.
		"""
		if f == '-':
			return sys.stdin
		# The file () builtin exists on Python 2 only; open () works on both.
		return open (f)
507
508
class Manifest:
	"""Reading and generating MANIFEST files that list a directory's entries."""

	@staticmethod
	def read (s, strict = True):
		"""Yield the normalized paths of all files reachable from s.

		If s is a file its path is yielded.  If s is a directory, its
		MANIFEST file is read and every entry listed there is expanded
		recursively.  When s, or the MANIFEST of directory s, is missing:
		with strict set an error is printed to stderr and the process
		exits with status 1, otherwise nothing is yielded.

		NOTE(review): strict is not forwarded to the recursive calls, so
		entries listed in a MANIFEST are always read strictly -- confirm
		this is intended.
		"""

		if not os.path.exists (s):
			if strict:
				print ("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
				sys.exit (1)
			return

		s = os.path.normpath (s)

		if not os.path.isdir (s):
			yield s
			return

		manifest = os.path.join (s, "MANIFEST")
		try:
			# open () instead of the Python 2-only file (); the with block
			# closes the handle as soon as the entries have been read.
			with open (manifest) as m:
				items = [x.strip () for x in m]
		except IOError:
			if strict:
				print ("%s: %s does not exist" % (sys.argv[0], manifest), file=sys.stderr)
				sys.exit (1)
			return

		for f in items:
			if not f:
				# A blank line would resolve to the directory itself and
				# recurse forever.
				continue
			for p in Manifest.read (os.path.join (s, f)):
				yield p

	@staticmethod
	def update_recursive (s):
		"""(Re)write the MANIFEST file of s and of every directory below it.

		Each MANIFEST lists the directory's files, then its subdirectories,
		both sorted; well-known metadata names (MANIFEST, README, ...) are
		left out and directories with such names are not descended into.
		Prints a "GEN" line for every file written.
		"""

		skipped = ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]

		# os.walk already descends into every remaining subdirectory, so no
		# explicit recursion is needed; editing dirnames in place prunes and
		# orders the walk.
		for dirpath, dirnames, filenames in os.walk (s, followlinks=True):

			for f in skipped:
				if f in dirnames:
					dirnames.remove (f)
				if f in filenames:
					filenames.remove (f)
			dirnames.sort ()
			filenames.sort ()
			ms = os.path.join (dirpath, "MANIFEST")
			print ("  GEN    %s" % ms)
			with open (ms, "w") as m:
				for f in filenames:
					print (f, file=m)
				for f in dirnames:
					print (f, file=m)
559
# Running this module directly does nothing.
if __name__ == '__main__':
	pass
562