1#!/usr/bin/python
2
3import sys, os, re, difflib, unicodedata, errno, cgi
4from itertools import *
5
# Default marker characters, one per compared file: ZipDiffer.diff_files
# prefixes a differing line taken from the i-th file with diff_symbols[i].
diff_symbols = "-+=*&^%$#@!~/"
# Default highlight colour names; DiffColorizer looks them up with the same
# index it finds in diff_symbols.  These are the keys known to
# ColorFormatter.ANSI.start_color.
diff_colors = ['red', 'green', 'blue']
8
class ColorFormatter:
	"""Namespace of output back-ends used to highlight text.

	Every back-end (Null, ANSI, HTML) offers the same four static methods:
	start_color (c), end_color (), escape (s) and newline ().
	"""

	class Null:
		"""Back-end that emits no markup at all."""
		@staticmethod
		def start_color (c): return ''
		@staticmethod
		def end_color (): return ''
		@staticmethod
		def escape (s): return s
		@staticmethod
		def newline (): return '\n'

	class ANSI:
		"""Back-end that emits ANSI terminal escape sequences."""

		# Escape sequence that starts each supported colour name.
		_sequences = {
			'red': '\033[41;37;1m',
			'green': '\033[42;37;1m',
			'blue': '\033[44;37;1m',
		}

		@staticmethod
		def start_color (c):
			# Raises KeyError for a colour name that is not in the table.
			return ColorFormatter.ANSI._sequences[c]
		@staticmethod
		def end_color ():
			return '\033[m'
		@staticmethod
		def escape (s): return s
		@staticmethod
		def newline (): return '\n'

	class HTML:
		"""Back-end that emits HTML <span> elements."""
		@staticmethod
		def start_color (c):
			return '<span style="background:%s">' % c
		@staticmethod
		def end_color ():
			return '</span>'
		@staticmethod
		def escape (s):
			# Same substitutions cgi.escape (s) performed; that function no
			# longer exists in current Python releases.  '&' must go first so
			# the entities produced for '<' and '>' are not escaped again.
			return s.replace ('&', '&amp;').replace ('<', '&lt;').replace ('>', '&gt;')
		@staticmethod
		def newline (): return '<br/>\n'

	@staticmethod
	def Auto (argv = None, out = sys.stdout):
		"""Pick a back-end from command-line flags and return it.

		argv: argument list to scan; recognised flags are removed from it in
		      place (first occurrence only).  None means no arguments.
		out:  accepted for interface compatibility; not consulted.

		Recognised flags are --format and --format=ansi (ANSI), --format=html
		(HTML) and --no-format (Null).  When several are present the one
		latest in that order wins; with none, ANSI is returned.
		"""
		if argv is None:
			argv = []
		chosen = ColorFormatter.ANSI
		for flag, backend in (
			("--format", ColorFormatter.ANSI),
			("--format=ansi", ColorFormatter.ANSI),
			("--format=html", ColorFormatter.HTML),
			("--no-format", ColorFormatter.Null),
		):
			if flag in argv:
				argv.remove (flag)
				chosen = backend
		return chosen
65
66
class DiffColorizer:
	"""Highlights the tokens that differ between paired diff lines."""

	# A (possibly empty) run of word characters followed by at most one
	# non-word character.  colorize_lines uses it to cut a line into tokens.
	# Both letter cases count as word characters.
	diff_regex = re.compile ('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

	def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
		"""formatter: back-end providing start_color (c), end_color (),
		              escape (s) and newline ().
		   colors:    colour names, indexed like symbols.
		   symbols:   marker characters; the first two mark the two sides."""
		self.formatter = formatter
		self.colors = colors
		self.symbols = symbols

	def colorize_lines (self, lines):
		"""Format one pair of lines, colouring the tokens that differ.

		lines: two-element sequence [first, second]; a missing side may be
		       None or empty.
		Returns a list with one formatted line per non-empty side, each
		prefixed with that side's symbol and ended with the formatter's
		newline."""
		lines = (l if l else '' for l in lines)
		# Put every token on its own line so difflib compares token by token.
		ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
		oo = ["",""] # output accumulated for each side
		st = [False, False] # whether a colour span is currently open per side
		for l in difflib.Differ().compare (*ss):
			if l[0] == '?':
				# Differ's intraline hint lines carry no content.
				continue
			if l[0] == ' ':
				# Token common to both sides: close any open span, emit as is.
				for i in range(2):
					if st[i]:
						oo[i] += self.formatter.end_color ()
						st[i] = False
				oo = [o + self.formatter.escape (l[2:]) for o in oo]
				continue
			if l[0] in self.symbols:
				# Token present on one side only: emit it inside a colour span.
				i = self.symbols.index (l[0])
				if not st[i]:
					oo[i] += self.formatter.start_color (self.colors[i])
					st[i] = True
				oo[i] += self.formatter.escape (l[2:])
				continue
		for i in range(2):
			if st[i]:
				oo[i] += self.formatter.end_color ()
				st[i] = False
		# Drop the per-token newlines inserted above.
		oo = [o.replace ('\n', '') for o in oo]
		return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]

	def colorize_diff (self, f):
		"""Generate the formatted form of diff f, line by line.

		Lines whose first character is not one of the symbols are escaped
		and passed through.  Marked lines are buffered, one slot per side,
		and handed to colorize_lines when both slots are filled, when a
		slot would be overwritten, or at end of input."""
		lines = [None, None]
		for l in f:
			if l[0] not in self.symbols:
				yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
				continue
			i = self.symbols.index (l[0])
			if lines[i]:
				# Flush: this side is already occupied.
				for line in self.colorize_lines (lines):
					yield line
				lines = [None, None]
			lines[i] = l[1:]
			if (all (lines)):
				# Flush: both sides are available.
				for line in self.colorize_lines (lines):
					yield line
				lines = [None, None]
		if (any (lines)):
			# Flush whatever is left over.
			for line in self.colorize_lines (lines):
				yield line
127
128
class ZipDiffer:
	"""Line-by-line comparison of several files read in lockstep."""

	@staticmethod
	def diff_files (files, symbols=diff_symbols):
		"""Write a merged diff of files to standard output.

		files:   iterable of line iterables (e.g. open file objects).
		symbols: marker characters; symbols[i] prefixes lines of the i-th file.

		A line that is identical in every file is written once, prefixed
		with a space.  Otherwise each file's line is written prefixed with
		that file's symbol; files that have run out contribute nothing.
		An IOError other than EPIPE is reported on stderr and the process
		exits with status 1; EPIPE is ignored silently."""
		# The function is called izip_longest on Python 2 and zip_longest on
		# Python 3; importing it here keeps the method usable on either.
		try:
			from itertools import izip_longest as zip_longest
		except ImportError:
			from itertools import zip_longest

		files = tuple (files) # in case it's a generator, copy it
		try:
			for lines in zip_longest (*files):
				if all (lines[0] == line for line in lines[1:]):
					sys.stdout.writelines ([" ", lines[0]])
					continue

				for i, l in enumerate (lines):
					if l:
						sys.stdout.writelines ([symbols[i], l])
		except IOError as e:
			if e.errno != errno.EPIPE:
				sys.stderr.write ("%s: %s: %s\n" % (sys.argv[0], e.filename, e.strerror))
				sys.exit (1)
147
148
class DiffFilters:
	"""Generators that pass through only part of a diff."""

	@staticmethod
	def filter_failures (f):
		"""Yield the lines of every test case in f that did not pass."""
		for _, group in DiffHelpers.separate_test_cases (f):
			# The group is read twice (judging, then emitting), so it has to
			# be materialised first.
			case = list (group)
			if DiffHelpers.test_passed (case):
				continue
			for line in case:
				yield line
157
class Stat:
	"""Running tally: number of tests added and the sum of their freq."""

	def __init__ (self):
		self.count = self.freq = 0

	def add (self, test):
		"""Count one more test and accumulate its freq attribute."""
		self.count = self.count + 1
		self.freq = self.freq + test.freq
167
class Stats:
	"""Pass / fail / total tallies for a set of tests, plus simple statistics."""

	def __init__ (self):
		self.passed = Stat ()
		self.failed = Stat ()
		self.total  = Stat ()

	def add (self, test):
		"""Record test in the total and in passed or failed, per test.passed."""
		self.total.add (test)
		if test.passed:
			self.passed.add (test)
		else:
			self.failed.add (test)

	def mean (self):
		"""Fraction of recorded tests that passed.
		   Raises ZeroDivisionError when nothing has been recorded."""
		return float (self.passed.count) / self.total.count

	def variance (self):
		"""Pass fraction times fail fraction.
		   Raises ZeroDivisionError when nothing has been recorded."""
		return (float (self.passed.count) / self.total.count) * \
		       (float (self.failed.count) / self.total.count)

	def stddev (self):
		"""Square root of variance ()."""
		return self.variance () ** .5

	def zscore (self, population):
		"""Calculate the standard score.
		   Population is the Stats for population.
		   Self is Stats for sample.
		   Returns larger absolute value if sample is highly unlikely to be random.
		   Anything outside of -3..+3 is very unlikely to be random.
		   When the population has no spread (every test passed, or every
		   test failed) the score is 0.0 if the sample mean equals the
		   population mean and an infinity of the matching sign otherwise.
		   See: http://en.wikipedia.org/wiki/Standard_score"""

		delta = self.mean () - population.mean ()
		sigma = population.stddev ()
		if sigma == 0:
			# Dividing would raise ZeroDivisionError; return the limit instead.
			if delta == 0:
				return 0.0
			return float ('inf') if delta > 0 else float ('-inf')
		return delta / sigma
201
202
203
204
class DiffSinks:
	"""Consumers that read a whole diff and print a summary of it."""

	@staticmethod
	def print_stat (f):
		"""Print how many test cases in f passed and how many failed.

		Input without any test case is reported as 0 out of 0 with a
		failure rate of 0%."""
		passed = 0
		failed = 0
		# XXX port to Stats, but that would really slow us down here
		for key, lines in DiffHelpers.separate_test_cases (f):
			if DiffHelpers.test_passed (lines):
				passed += 1
			else:
				failed += 1
		total = passed + failed
		# Guard the percentage: empty input would otherwise divide by zero.
		percent = 100. * failed / total if total else 0.
		print ("%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, percent))

	@staticmethod
	def print_ngrams (f, ns=(1,2,3)):
		"""Print the z-score of every frequent failing code point n-gram.

		f:  diff lines to analyse.
		ns: the n-gram lengths to collect.

		Each n-gram of a test case's code points is tallied in its own
		Stats; n-grams seen in fewer than 30 failed cases are not printed."""
		gens = tuple (Ngram.generator (n) for n in ns)
		allstats = Stats ()
		allgrams = {}
		for key, lines in DiffHelpers.separate_test_cases (f):
			test = Test (lines)
			allstats.add (test)

			# A test case without a code point line leaves unicodes as None;
			# it still counts in allstats but contributes no n-grams.
			unicodes = test.unicodes or ()
			for gen in gens:
				for ngram in gen (unicodes):
					if ngram not in allgrams:
						allgrams[ngram] = Stats ()
					allgrams[ngram].add (test)

		for ngram, stats in allgrams.items ():
			if stats.failed.count < 30: # for statistical reasons
				continue
			print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))
244
245
246
class Test:
	"""One test case parsed from the diff lines that belong to it.

	Attributes:
	  freq        always 1
	  passed      False as soon as any line does not start with a space
	  identifier  text between the marker character and the first colon
	  text        payload of the line bracketed with ( )
	  unicodes    code points parsed from the line bracketed with < >
	  glyphs      payload of the line bracketed with [ ]
	"""

	def __init__ (self, lines):
		"""lines: iterable of the diff lines of a single test case."""
		self.freq = 1
		self.passed = True
		self.identifier = None
		self.text = None
		self.unicodes = None
		self.glyphs = None
		for l in lines:
			symbol = l[0]
			if symbol != ' ':
				self.passed = False
			i = 1
			if ':' in l:
				i = l.index (':')
				if not self.identifier:
					self.identifier = l[1:i]
				i = i + 2 # Skip colon and space
			if i >= len (l):
				# Nothing after the marker / identifier: no payload to read.
				continue
			# j indexes the closing bracket: the last character, or the one
			# before it when the line ends with a newline.
			j = -1
			if l[j] == '\n':
				j -= 1
			brackets = l[i] + l[j]
			# Slice up to the closing bracket itself, so a line without a
			# trailing newline keeps its last payload character.
			l = l[i+1:j]
			if brackets == '()':
				self.text = l
			elif brackets == '<>':
				self.unicodes = Unicode.parse (l)
			elif brackets == '[]':
				# XXX we don't handle failed tests here
				self.glyphs = l
278
279
class DiffHelpers:
	"""Helpers for grouping diff lines into test cases and judging them."""

	@staticmethod
	def separate_test_cases (f):
		'''Group consecutive lines of f that share an identifier.
		   The identifier is whatever sits between the first character of
		   a line and its first colon; a line with no colon after its
		   first character is its own key.  Returns (key, lines) pairs as
		   produced by groupby.'''

		def identifier (l):
			if l.find (':', 1) < 0:
				return l
			return l[1:l.index (':')]

		return groupby (f, identifier)

	@staticmethod
	def test_passed (lines):
		'''Tell whether the given lines form a passing test case.
		   A case passes when every line starts with a space, or when a
		   line starting with "+" mentions one of the markers below.'''
		lines = list (lines)
		# XXX This is a hack, but does the job for now.
		markers = (
			"space+0|space+0",
			"uni25CC",
			"dottedcircle",
			"glyph0",
			"gid0",
			"notdef",
		)
		for marker in markers:
			if any (marker in l for l in lines if l[0] == '+'):
				return True
		return all (l[0] == ' ' for l in lines)
305
306
class FilterHelpers:
	"""Adapters that turn a line filter into a function printing its output."""

	@staticmethod
	def filter_printer_function (filter_callback):
		"""Return printer (f), which prints every item that
		   filter_callback (f) yields, each followed by a newline."""
		def printer (f):
			for line in filter_callback (f):
				# Parenthesised so the statement is valid on Python 2 and 3.
				print (line)
		return printer

	@staticmethod
	def filter_printer_function_no_newline (filter_callback):
		"""Return printer (f), which writes every item that
		   filter_callback (f) yields to standard output unchanged."""
		def printer (f):
			for line in filter_callback (f):
				sys.stdout.write (line)
		return printer
322
323
class Ngram:
	"""Factory for sliding-window n-gram generators."""

	@staticmethod
	def generator (n):
		"""Return a generator function that yields every run of n
		   consecutive items of its argument as a tuple.  The returned
		   function carries n in its attribute of the same name."""

		def gen (f):
			window = []
			for item in f:
				window.append (item)
				if len (window) != n:
					continue
				yield tuple (window)
				# Slide forward by dropping the oldest item.
				del window[:1]

		gen.n = n
		return gen
339
340
class UtilMains:
	"""Ready-made command-line drivers that feed sys.argv to a callback."""

	@staticmethod
	def _report_io_error (e):
		"""Report IOError e on stderr and exit with status 1, unless it is
		   a broken pipe (EPIPE), which is ignored."""
		if e.errno != errno.EPIPE:
			sys.stderr.write ("%s: %s: %s\n" % (sys.argv[0], e.filename, e.strerror))
			sys.exit (1)

	@staticmethod
	def process_multiple_files (callback, mnemonic = "FILE"):
		"""Call callback on the opened form of every file named on the
		   command line; with no arguments, on standard input ('-').
		   --help prints the usage line and exits with status 1."""

		if "--help" in sys.argv:
			print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
			sys.exit (1)

		try:
			files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
			for s in files:
				callback (FileHelpers.open_file_or_stdin (s))
		except IOError as e:
			UtilMains._report_io_error (e)

	@staticmethod
	def process_multiple_args (callback, mnemonic):
		"""Call callback on every command-line argument.  With no arguments,
		   or with --help, print the usage line and exit with status 1."""

		if len (sys.argv) == 1 or "--help" in sys.argv:
			print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
			sys.exit (1)

		try:
			for s in sys.argv[1:]:
				callback (s)
		except IOError as e:
			UtilMains._report_io_error (e)

	@staticmethod
	def filter_multiple_strings_or_stdin (callback, mnemonic, \
					      separator = " ", \
					      concat_separator = False):
		"""Print callback applied to the command-line arguments, or to each
		   line of standard input when there are no arguments.

		   separator:        joins the results of the individual arguments.
		   concat_separator: unless False, the arguments are first joined
		                     with it into one string, which is then passed
		                     to callback as a single argument.
		   --help prints the usage text and exits with status 1."""

		if "--help" in sys.argv:
			print ("Usage:\n  %s %s...\nor:\n  %s\n\nWhen called with no arguments, input is read from standard input." \
			      % (sys.argv[0], mnemonic, sys.argv[0]))
			sys.exit (1)

		try:
			if len (sys.argv) == 1:
				# readline () rather than iterating the file, so that each
				# line is answered as soon as it has been read.
				while True:
					line = sys.stdin.readline ()
					if not line:
						break
					if line[-1] == '\n':
						line = line[:-1]
					print (callback (line))
			else:
				args = sys.argv[1:]
				if concat_separator != False:
					args = [concat_separator.join (args)]
				print (separator.join (callback (x) for x in args))
		except IOError as e:
			UtilMains._report_io_error (e)
402
403
class Unicode:
	"""Conversions between UTF-8 text, code point lists and character names."""

	@staticmethod
	def decode (s):
		"""Turn UTF-8 string s into its comma-separated U+XXXX listing,
		   itself returned UTF-8 encoded."""
		chars = unicode (s, 'utf-8')
		labels = ["U+%04X" % ord (c) for c in chars]
		return u','.join (labels).encode ('utf-8')

	@staticmethod
	def parse (s):
		"""Return the list of integers for the hexadecimal code points
		   written in s.  "0x" prefixes and the punctuation commonly put
		   around code points are treated as separators."""
		for noise in (r"0[xX]", r"[<+>,;&#\\xXuU\n	]"):
			s = re.sub (noise, " ", s)
		return [int (token, 16) for token in s.split ()]

	@staticmethod
	def encode (s):
		"""Turn the code points written in s (see parse) into the
		   corresponding UTF-8 encoded text."""
		codepoints = Unicode.parse (s)
		return u''.join (unichr (cp) for cp in codepoints).encode ('utf-8')

	# Abbreviations substituted for a few long character names.
	shorthands = {
		"ZERO WIDTH NON-JOINER": "ZWNJ",
		"ZERO WIDTH JOINER": "ZWJ",
		"NARROW NO-BREAK SPACE": "NNBSP",
		"COMBINING GRAPHEME JOINER": "CGJ",
		"LEFT-TO-RIGHT MARK": "LRM",
		"RIGHT-TO-LEFT MARK": "RLM",
		"LEFT-TO-RIGHT EMBEDDING": "LRE",
		"RIGHT-TO-LEFT EMBEDDING": "RLE",
		"POP DIRECTIONAL FORMATTING": "PDF",
		"LEFT-TO-RIGHT OVERRIDE": "LRO",
		"RIGHT-TO-LEFT OVERRIDE": "RLO",
	}

	@staticmethod
	def pretty_name (u):
		"""Return a shortened form of the Unicode name of character u,
		   or "XXX" when the character has no name."""
		try:
			name = unicodedata.name (u)
		except ValueError:
			return "XXX"
		# Applied in this order; each step sees the result of the previous.
		for pattern, replacement in (
			(".* LETTER ", ""),
			(".* VOWEL SIGN (.*)", r"\1-MATRA"),
			(".* SIGN ", ""),
			(".* COMBINING ", ""),
		):
			name = re.sub (pattern, replacement, name)
		if re.match (".* VIRAMA", name):
			name = "HALANT"
		return Unicode.shorthands.get (name, name)

	@staticmethod
	def pretty_names (s):
		"""Return the pretty names of the code points written in s, joined
		   with " + " and UTF-8 encoded."""
		s = re.sub (r"[<+>\\uU]", " ", s)
		s = re.sub (r"0[xX]", " ", s)
		chars = [unichr (int (x, 16)) for x in re.split ('[, \n]', s) if x]
		names = (Unicode.pretty_name (c) for c in chars)
		return u' + '.join (names).encode ('utf-8')
456
457
class FileHelpers:
	"""Small helpers for opening input files."""

	@staticmethod
	def open_file_or_stdin (f):
		"""Return sys.stdin when f is '-', otherwise f opened for reading.
		   Raises IOError when the file cannot be opened."""
		if f == '-':
			return sys.stdin
		# open () rather than file (): the latter exists on Python 2 only.
		return open (f)
465
466
class Manifest:
	"""Reading and regenerating the MANIFEST files that list a directory tree."""

	@staticmethod
	def read (s, strict = True):
		"""Generate the normalised paths of all files reachable from s.

		s:      a file, or a directory containing a MANIFEST file that lists
		        its entries one per line (blank lines are ignored).
		strict: when true, a missing path or missing MANIFEST is reported on
		        stderr and the process exits with status 1; when false such
		        entries are skipped.  The setting also applies to every
		        nested directory.

		Being a generator, nothing is checked until iteration starts."""

		if not os.path.exists (s):
			if strict:
				sys.stderr.write ("%s: %s does not exist\n" % (sys.argv[0], s))
				sys.exit (1)
			return

		s = os.path.normpath (s)

		if not os.path.isdir (s):
			yield s
			return

		manifest = os.path.join (s, "MANIFEST")
		try:
			# Only opening and reading is guarded, and the file is closed
			# before descending into the entries.
			with open (manifest) as m:
				items = [x.strip () for x in m]
		except IOError:
			if strict:
				sys.stderr.write ("%s: %s does not exist\n" % (sys.argv[0], manifest))
				sys.exit (1)
			return

		for f in items:
			if not f:
				# A blank line would join back to s itself and recurse forever.
				continue
			for p in Manifest.read (os.path.join (s, f), strict):
				yield p

	@staticmethod
	def update_recursive (s):
		"""(Re)write the MANIFEST file of s and of every directory below it.

		Each MANIFEST lists the directory's files, then its subdirectories,
		both sorted, one name per line.  Entries named like the files in
		skipped below are left out, and such directories are not entered.
		A "  GEN    <path>" line is printed for every file written."""

		skipped = ("MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog")

		# os.walk already visits every subdirectory, so no explicit recursion
		# is needed; each directory is generated exactly once.
		for dirpath, dirnames, filenames in os.walk (s, followlinks=True):

			# Edit dirnames in place: os.walk descends only into what is left.
			dirnames[:] = sorted (d for d in dirnames if d not in skipped)
			filenames = sorted (f for f in filenames if f not in skipped)

			ms = os.path.join (dirpath, "MANIFEST")
			print ("  GEN    %s" % ms)
			with open (ms, "w") as m:
				for f in filenames + dirnames:
					m.write (f + "\n")
517
# Running this file directly does nothing: it only defines the helper
# classes above.
if __name__ == '__main__':
	pass
520