1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# compose-parse.py, version 1.3
5#
6# multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7# the script produces statistics and information about the whole process, run with --help for more.
8#
9# You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
10#
11# Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
12
13from re			import findall, match, split, sub
14from string		import atoi
15from unicodedata	import normalize
16from urllib 		import urlretrieve
17from os.path		import isfile, getsize
18from copy 		import copy
19
20import sys
21import getopt
22
# We grab files off the web, left and right.
# Each URL below is handed to download_file(), which stores the file in the
# current directory under the text after the URL's last '/'.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.2.0/ucd/UnicodeData.txt'
# Optional local file; when it can be opened, its lines are appended to the
# lines read from the upstream Compose file before parsing.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# keysym name -> keysym value; replaced by the result of process_gdkkeysymsh().
keysymdatabase = {}
# keysym name -> Unicode value; replaced by the result of process_keysymstxt().
keysymunicodedatabase = {}
# codepoint -> [name, decomposition fields, combining class]; filled in by
# process_unicodedata_file().
unicodedatabase = {}
40
# Text printed before the generated table when --gtk is given: licence block,
# provenance notes, include guard and the opening of the C array.
# NOTE(review): the provenance text names .../Public/UNIDATA/UnicodeData.txt
# while URL_UNICODEDATATXT points at the 5.2.0 release -- confirm which one
# the generated header should advertise.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 *
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring                                          [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

# Text printed after the generated table: closes the C array and the include
# guard opened by headerfile_start.
headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
115
def stringtohex(str):
	""" Converts a string of hexadecimal digits (e.g. "1E9E") to an integer.

	Uses the built-in int() rather than the deprecated string.atoi(); both
	parse the text in base 16 and raise ValueError on malformed input.
	"""
	return int(str, 16)
117
def factorial(n):
	""" Returns n! for n > 1, and 1 for every n <= 1 (including negatives). """
	product = 1
	# Multiply downwards until the running factor drops to 1 or below.
	while n > 1:
		product = product * n
		n = n - 1
	return product
123
def uniq(*args):
	""" Performs a uniq operation on a list or lists.

	All positional arguments are iterables; their elements are visited in
	order and the first occurrence of each distinct element is kept.
	Membership is tested with a list scan (not a set) because the elements
	may be unhashable, e.g. lists.

	Returns a new list; the inputs are not modified.
	"""
	theFinalList = []
	for theList in args:
		for elem in theList:
			if elem not in theFinalList:
				theFinalList.append(elem)
	return theFinalList
134
135
136
def all_permutations(seq):
	""" Produces all permutations of the items of a list (or string).

	Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178

	This is a generator. A sequence of length 0 or 1 yields itself once;
	otherwise the first item is inserted at every position of each
	permutation of the remaining items.
	"""
	if len(seq) <= 1:
		yield seq
	else:
		for perm in all_permutations(seq[1:]):
			for i in range(len(perm) + 1):
				# nb seq[0:1] works in both string and list contexts
				yield perm[:i] + seq[0:1] + perm[i:]
147
148def usage():
149	print """compose-parse available parameters:
150	-h, --help		this craft
151	-s, --statistics	show overall statistics (both algorithmic, non-algorithmic)
152	-a, --algorithmic	show sequences saved with algorithmic optimisation
153	-g, --gtk		show entries that go to GTK+
154	-u, --unicodedatatxt	show compose sequences derived from UnicodeData.txt (from unicode.org)
155	-v, --verbose		show verbose output
156        -p, --plane1		show plane1 compose sequences
157	-n, --numeric		when used with --gtk, create file with numeric values only
158	-e, --gtk-expanded	when used with --gtk, create file that repeats first column; not usable in GTK+
159	--all-sequences		when used with --gtk, create file with entries rejected by default
160	Default is to show statistics.
161	"""
162
# Parse the command line. Only option-parsing failures lead to the usage
# text; anything else (e.g. KeyboardInterrupt) is no longer swallowed.
try:
	opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
		"stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded", "all-sequences"])
except getopt.GetoptError:
	usage()
	sys.exit(2)

# One module-level flag per command-line switch; all default to off.
opt_statistics = False
opt_algorithmic = False
opt_gtk = False
opt_unicodedatatxt = False
opt_verbose = False
opt_plane1 = False
opt_numeric = False
opt_gtkexpanded = False
opt_allsequences = False

for o, a in opts:
	if o in ("-h", "--help"):
		usage()
		sys.exit()
	# "--stats" is accepted by getopt above, so honour it as an alias
	# instead of silently ignoring it.
	elif o in ("-s", "--statistics", "--stats"):
		opt_statistics = True
	elif o in ("-a", "--algorithmic"):
		opt_algorithmic = True
	elif o in ("-g", "--gtk"):
		opt_gtk = True
	elif o in ("-u", "--unicodedatatxt"):
		opt_unicodedatatxt = True
	elif o in ("-v", "--verbose"):
		opt_verbose = True
	elif o in ("-p", "--plane1"):
		opt_plane1 = True
	elif o in ("-n", "--numeric"):
		opt_numeric = True
	elif o in ("-e", "--gtk-expanded"):
		opt_gtkexpanded = True
	elif o == "--all-sequences":
		opt_allsequences = True

# Statistics are the default report when no other report was requested.
if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
	opt_statistics = True
205
206def download_hook(blocks_transferred, block_size, file_size):
207	""" A download hook to provide some feedback when downloading """
208	if blocks_transferred == 0:
209		if file_size > 0:
210			if opt_verbose:
211				print "Downloading", file_size, "bytes: ",
212		else:
213			if opt_verbose:
214				print "Downloading: ",
215	sys.stdout.write('#')
216	sys.stdout.flush()
217
218
219def download_file(url):
220	""" Downloads a file provided a URL. Returns the filename. """
221	""" Borks on failure """
222	localfilename = url.split('/')[-1]
223        if not isfile(localfilename) or getsize(localfilename) <= 0:
224		if opt_verbose:
225			print "Downloading ", url, "..."
226		try:
227			urlretrieve(url, localfilename, download_hook)
228		except IOError, (errno, strerror):
229			print "I/O error(%s): %s" % (errno, strerror)
230			sys.exit(-1)
231		except:
232			print "Unexpected error: ", sys.exc_info()[0]
233			sys.exit(-1)
234		print " done."
235        else:
236		if opt_verbose:
237                	print "Using cached file for ", url
238	return localfilename
239
240def process_gdkkeysymsh():
241	""" Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
242	""" Fills up keysymdb with contents """
243	filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
244	try:
245		gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
246	except IOError, (errno, strerror):
247		print "I/O error(%s): %s" % (errno, strerror)
248		sys.exit(-1)
249	except:
250		print "Unexpected error: ", sys.exc_info()[0]
251		sys.exit(-1)
252
253	""" Parse the gdkkeysyms.h file and place contents in  keysymdb """
254	linenum_gdkkeysymsh = 0
255	keysymdb = {}
256	for line in gdkkeysymsh.readlines():
257		linenum_gdkkeysymsh += 1
258		line = line.strip()
259		if line == "" or not match('^#define GDK_KEY_', line):
260			continue
261		components = split('\s+', line)
262		if len(components) < 3:
263			print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
264			% {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
265			print "Was expecting 3 items in the line"
266			sys.exit(-1)
267		if not match('^GDK_KEY_', components[1]):
268			print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
269			% {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
270			print "Was expecting a keysym starting with GDK_KEY_"
271			sys.exit(-1)
272		if match('^0x[0-9a-fA-F]+$', components[2]):
273			unival = long(components[2][2:], 16)
274			if unival == 0:
275				continue
276			keysymdb[components[1][8:]] = unival
277		else:
278			print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
279			% {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
280			print "Was expecting a hexadecimal number at the end of the line"
281			sys.exit(-1)
282	gdkkeysymsh.close()
283
284	""" Patch up the keysymdb with some of our own stuff """
285
286	""" This is for a missing keysym from the currently upstream file """
287	#keysymdb['dead_stroke'] = 0x338
288
289	""" This is for a missing keysym from the currently upstream file """
290	###keysymdb['dead_belowring'] = 0x323
291	###keysymdb['dead_belowmacron'] = 0x331
292	###keysymdb['dead_belowcircumflex'] = 0x32d
293	###keysymdb['dead_belowtilde'] = 0x330
294	###keysymdb['dead_belowbreve'] = 0x32e
295	###keysymdb['dead_belowdiaeresis'] = 0x324
296
297	""" This is^Wwas preferential treatment for Greek """
298	# keysymdb['dead_tilde'] = 0x342
299	""" This is^was preferential treatment for Greek """
300	#keysymdb['combining_tilde'] = 0x342
301
302	""" Fixing VoidSymbol """
303	keysymdb['VoidSymbol'] = 0xFFFF
304
305	return keysymdb
306
307def process_keysymstxt():
308	""" Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
309	""" This file keeps a record between keysyms <-> unicode chars """
310	filename_keysymstxt = download_file(URL_KEYSYMSTXT)
311	try:
312		keysymstxt = open(filename_keysymstxt, 'r')
313	except IOError, (errno, strerror):
314		print "I/O error(%s): %s" % (errno, strerror)
315		sys.exit(-1)
316	except:
317		print "Unexpected error: ", sys.exc_info()[0]
318		sys.exit(-1)
319
320	""" Parse the keysyms.txt file and place content in  keysymdb """
321	linenum_keysymstxt = 0
322	keysymdb = {}
323	for line in keysymstxt.readlines():
324		linenum_keysymstxt += 1
325		line = line.strip()
326		if line == "" or match('^#', line):
327			continue
328		components = split('\s+', line)
329		if len(components) < 5:
330			print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
331			% {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
332			print "Was expecting 5 items in the line"
333			sys.exit(-1)
334		if match('^U[0-9a-fA-F]+$', components[1]):
335			unival = long(components[1][1:], 16)
336		if unival == 0:
337			continue
338		keysymdb[components[4]] = unival
339	keysymstxt.close()
340
341	""" Patch up the keysymdb with some of our own stuff """
342	""" This is for a missing keysym from the currently upstream file """
343	###keysymdb['dead_belowring'] = 0x323
344	###keysymdb['dead_belowmacron'] = 0x331
345	###keysymdb['dead_belowcircumflex'] = 0x32d
346	###keysymdb['dead_belowtilde'] = 0x330
347	###keysymdb['dead_belowbreve'] = 0x32e
348	###keysymdb['dead_belowdiaeresis'] = 0x324
349
350	""" This is preferential treatment for Greek """
351	""" => we get more savings if used for Greek """
352	# keysymdb['dead_tilde'] = 0x342
353	""" This is preferential treatment for Greek """
354	# keysymdb['combining_tilde'] = 0x342
355
356	""" This is for a missing keysym from Markus Kuhn's db """
357	keysymdb['dead_stroke'] = 0x338
358	""" This is for a missing keysym from Markus Kuhn's db """
359	keysymdb['Oslash'] = 0x0d8
360	""" This is for a missing keysym from Markus Kuhn's db """
361	keysymdb['Ssharp'] = 0x1e9e
362
363	""" This is for a missing (recently added) keysym """
364	keysymdb['dead_psili'] = 0x313
365	""" This is for a missing (recently added) keysym """
366	keysymdb['dead_dasia'] = 0x314
367
368	""" Allows to import Multi_key sequences """
369	keysymdb['Multi_key'] = 0xff20
370
371        keysymdb['zerosubscript'] = 0x2080
372        keysymdb['onesubscript'] = 0x2081
373        keysymdb['twosubscript'] = 0x2082
374        keysymdb['threesubscript'] = 0x2083
375        keysymdb['foursubscript'] = 0x2084
376        keysymdb['fivesubscript'] = 0x2085
377        keysymdb['sixsubscript'] = 0x2086
378        keysymdb['sevensubscript'] = 0x2087
379        keysymdb['eightsubscript'] = 0x2088
380        keysymdb['ninesubscript'] = 0x2089
381        keysymdb['dead_doublegrave'] = 0x030F
382        keysymdb['dead_invertedbreve'] = 0x0311
383
384	return keysymdb
385
386def keysymvalue(keysym, file = "n/a", linenum = 0):
387	""" Extracts a value from the keysym """
388	""" Find the value of keysym, using the data from keysyms """
389	""" Use file and linenum to when reporting errors """
390	if keysym == "":
391		return 0
392       	if keysymdatabase.has_key(keysym):
393               	return keysymdatabase[keysym]
394       	elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
395               	return atoi(keysym[1:], 16)
396       	elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
397		return atoi(keysym[2:], 16)
398	else:
399        	print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
400               	#return -1
401		sys.exit(-1)
402
403def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
404	""" Extracts a value from the keysym """
405	""" Find the value of keysym, using the data from keysyms """
406	""" Use file and linenum to when reporting errors """
407	if keysym == "":
408		return 0
409       	if keysymunicodedatabase.has_key(keysym):
410               	return keysymunicodedatabase[keysym]
411       	elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
412               	return atoi(keysym[1:], 16)
413       	elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
414		return atoi(keysym[2:], 16)
415	else:
416        	print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
417               	sys.exit(-1)
418
def rename_combining(seq):
	""" Returns a copy of seq with 'combining_*' keysyms renamed to 'dead_*'.

	After the prefix change, 'dead_double_grave' and 'dead_inverted_breve'
	are additionally mapped to the spellings used elsewhere in this script
	('dead_doublegrave', 'dead_invertedbreve'). Other names pass through
	unchanged. The input sequence is not modified.
	"""
	respelled = {
		'dead_double_grave': 'dead_doublegrave',
		'dead_inverted_breve': 'dead_invertedbreve',
	}
	filtered_sequence = []
	for ks in seq:
		if ks.startswith('combining_'):
			ks = 'dead_' + ks[len('combining_'):]
		filtered_sequence.append(respelled.get(ks, ks))
	return filtered_sequence
430
431
432keysymunicodedatabase = process_keysymstxt()
433keysymdatabase = process_gdkkeysymsh()
434
435""" Grab and open the compose file from upstream """
436filename_compose = download_file(URL_COMPOSE)
437try:
438	composefile = open(filename_compose, 'r')
439except IOError, (errno, strerror):
440	print "I/O error(%s): %s" % (errno, strerror)
441	sys.exit(-1)
442except:
443	print "Unexpected error: ", sys.exc_info()[0]
444	sys.exit(-1)
445
446""" Look if there is a lookaside (supplementary) compose file in the current
447    directory, and if so, open, then merge with upstream Compose file.
448"""
449xorg_compose_sequences_raw = []
450for seq in composefile.readlines():
451        xorg_compose_sequences_raw.append(seq)
452
453try:
454        composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
455        for seq in composefile_lookaside.readlines():
456                xorg_compose_sequences_raw.append(seq)
457except IOError, (errno, strerror):
458        if opt_verbose:
459                print "I/O error(%s): %s" % (errno, strerror)
460                print "Did not find lookaside compose file. Continuing..."
461except:
462        print "Unexpected error: ", sys.exc_info()[0]
463        sys.exit(-1)
464
465""" Parse the compose file in  xorg_compose_sequences"""
466xorg_compose_sequences = []
467xorg_compose_sequences_algorithmic = []
468linenum_compose = 0
469comment_nest_depth = 0
470for line in xorg_compose_sequences_raw:
471	linenum_compose += 1
472	line = line.strip()
473	if match("^XCOMM", line) or match("^#", line):
474		continue
475
476	line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
477
478	comment_start = line.find("/*")
479
480	if comment_start >= 0:
481		if comment_nest_depth == 0:
482			line = line[:comment_start]
483		else:
484			line = ""
485
486		comment_nest_depth += 1
487	else:
488		comment_end = line.find("*/")
489
490		if comment_end >= 0:
491			comment_nest_depth -= 1
492
493		if comment_nest_depth < 0:
494			print "Invalid comment %(linenum_compose)d in %(filename)s: \
495			Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
496			exit(-1)
497
498		if comment_nest_depth > 0:
499			line = ""
500		else:
501			line = line[comment_end + 2:]
502
503	if line is "":
504		continue
505
506	#line = line[:-1]
507	components = split(':', line)
508	if len(components) != 2:
509		print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
510		/value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
511		exit(-1)
512	(seq, val ) = split(':', line)
513	seq = seq.strip()
514	val = val.strip()
515	raw_sequence = findall('\w+', seq)
516	values = split('\s+', val)
517	unichar_temp = split('"', values[0])
518	unichar = unichar_temp[1]
519	if len(values) == 1:
520		continue
521	codepointstr = values[1]
522	if values[1] == '#':
523		# No codepoints that are >1 characters yet.
524		continue
525	if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
526		raw_sequence[0] = '0x' + raw_sequence[0][1:]
527	if  match('^U[0-9a-fA-F]+$', codepointstr):
528		codepoint = long(codepointstr[1:], 16)
529	elif keysymunicodedatabase.has_key(codepointstr):
530		#if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
531			#print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
532			#print raw_sequence, codepointstr
533		codepoint = keysymunicodedatabase[codepointstr]
534	else:
535		print
536		print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
537		 %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
538		exit(-1)
539	sequence = rename_combining(raw_sequence)
540	reject_this = False
541	for i in sequence:
542		if keysymvalue(i) > 0xFFFF:
543			reject_this = True
544			if opt_plane1:
545				print sequence
546			break
547		if keysymvalue(i) < 0:
548			reject_this = True
549			break
550	if reject_this:
551		continue
552	if "U0342" in sequence or \
553		"U0313" in sequence or \
554		"U0314" in sequence or \
555		"0x0313" in sequence or \
556		"0x0342" in sequence or \
557		"0x0314" in sequence:
558		continue
559	if "dead_belowring" in sequence or\
560                "dead_currency" in sequence or\
561		"dead_belowcomma" in sequence or\
562		"dead_belowmacron" in sequence or\
563		"dead_belowtilde" in sequence or\
564		"dead_belowbreve" in sequence or\
565		"dead_belowdiaeresis" in sequence or\
566		"dead_belowcircumflex" in sequence:
567		continue
568	#for i in range(len(sequence)):
569	#	if sequence[i] == "0x0342":
570	#		sequence[i] = "dead_tilde"
571	if "Multi_key" not in sequence:
572		""" Ignore for now >0xFFFF keysyms """
573		if codepoint < 0xFFFF:
574			original_sequence = copy(sequence)
575			stats_sequence = copy(sequence)
576			base = sequence.pop()
577			basechar = keysymvalue(base, filename_compose, linenum_compose)
578
579			if basechar < 0xFFFF:
580				counter = 1
581				unisequence = []
582				not_normalised = True
583				skipping_this = False
584				for i in range(0, len(sequence)):
585					""" If the sequence has dead_tilde and is for Greek, we don't do algorithmically
586					    because of lack of dead_perispomeni (i.e. conflict)
587					"""
588					bc = basechar
589					"""if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
590						skipping_this = True
591						break
592					if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
593						skipping_this = True
594						break
595					if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
596						skipping_this = True
597						break
598					if sequence[-1] == "dead_psili":
599						sequence[i] = "dead_horn"
600					if sequence[-1] == "dead_dasia":
601						sequence[-1] = "dead_ogonek"
602					"""
603					unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
604
605				if skipping_this:
606					unisequence = []
607				for perm in all_permutations(unisequence):
608					# print counter, original_sequence, unichr(basechar) + "".join(perm)
609					# print counter, map(unichr, perm)
610					normalized = normalize('NFC', unichr(basechar) + "".join(perm))
611					if len(normalized) == 1:
612						# print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
613						# % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
614						# print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
615						stats_sequence_data = map(keysymunicodevalue, stats_sequence)
616						stats_sequence_data.append(normalized)
617						xorg_compose_sequences_algorithmic.append(stats_sequence_data)
618						not_normalised = False
619						break;
620					counter += 1
621				if not_normalised or opt_allsequences:
622					original_sequence.append(codepoint)
623					xorg_compose_sequences.append(original_sequence)
624					""" print xorg_compose_sequences[-1] """
625
626			else:
627				print "Error in base char !?!"
628				exit(-2)
629		else:
630			print "OVER", sequence
631			exit(-1)
632	else:
633		sequence.append(codepoint)
634		xorg_compose_sequences.append(sequence)
635		""" print xorg_compose_sequences[-1] """
636
def sequence_cmp(x, y):
	""" cmp-style ordering of two compose sequences by keysym value.

	Order of comparison: value of the first keysym, then sequence length,
	then the value of the second keysym, then the keysyms at indices 2, 3
	and 4 for as long as x is long enough to have one before its final
	element. Returns 1, -1 or 0.
	"""
	def three_way(left, right):
		if left > right:
			return 1
		if left < right:
			return -1
		return 0

	verdict = three_way(keysymvalue(x[0]), keysymvalue(y[0]))
	if verdict != 0:
		return verdict
	verdict = three_way(len(x), len(y))
	if verdict != 0:
		return verdict
	verdict = three_way(keysymvalue(x[1]), keysymvalue(y[1]))
	if verdict != 0:
		return verdict
	for position in (2, 3, 4):
		# Stop once x has no keysym at this position.
		if len(x) < position + 2:
			return 0
		verdict = three_way(keysymvalue(x[position]), keysymvalue(y[position]))
		if verdict != 0:
			return verdict
	return 0
670
def sequence_unicode_cmp(x, y):
	""" cmp-style ordering of two compose sequences by Unicode value.

	Order of comparison: Unicode value of the first keysym, then sequence
	length, then the Unicode value of the second keysym, then the keysyms
	at indices 2, 3 and 4 for as long as x is long enough to have one
	before its final element. Returns 1, -1 or 0.
	"""
	def three_way(left, right):
		if left > right:
			return 1
		if left < right:
			return -1
		return 0

	verdict = three_way(keysymunicodevalue(x[0]), keysymunicodevalue(y[0]))
	if verdict != 0:
		return verdict
	verdict = three_way(len(x), len(y))
	if verdict != 0:
		return verdict
	verdict = three_way(keysymunicodevalue(x[1]), keysymunicodevalue(y[1]))
	if verdict != 0:
		return verdict
	for position in (2, 3, 4):
		# Stop once x has no keysym at this position.
		if len(x) < position + 2:
			return 0
		verdict = three_way(keysymunicodevalue(x[position]), keysymunicodevalue(y[position]))
		if verdict != 0:
			return verdict
	return 0
704
def sequence_algorithmic_cmp(x, y):
	""" cmp-style ordering: shorter sequences first, then element by element.

	Returns -1, 1 or 0.
	"""
	if len(x) != len(y):
		return -1 if len(x) < len(y) else 1
	# Same length: the first differing pair decides.
	for left, right in zip(x, y):
		if left < right:
			return -1
		if left > right:
			return 1
	return 0
717
718
xorg_compose_sequences.sort(sequence_cmp)

# Drop duplicates: of each run of neighbours that sequence_unicode_cmp()
# considers equal, only the last one is kept.
xorg_compose_sequences_uniqued = []
item = None
for next_item in xorg_compose_sequences:
	if item is not None and sequence_unicode_cmp(item, next_item) != 0:
		xorg_compose_sequences_uniqued.append(item)
	item = next_item
# The final item has no successor to compare against; without this step the
# last compose sequence would be dropped.
if item is not None:
	xorg_compose_sequences_uniqued.append(item)

xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)

# Count the sequences that contain Multi_key among their keysyms.
counter_multikey = 0
for item in xorg_compose_sequences:
	if findall('Multi_key', "".join(item[:-1])) != []:
		counter_multikey += 1

xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)

# Gather counters: distinct first keysyms, padding zeroes and entries.
firstitem = ""
num_first_keysyms = 0
zeroes = 0
num_entries = 0
num_algorithmic_greek = 0
for sequence in xorg_compose_sequences:
	if keysymvalue(firstitem) != keysymvalue(sequence[0]):
		firstitem = sequence[0]
		num_first_keysyms += 1
	zeroes += 6 - len(sequence) + 1
	num_entries += 1

# Count algorithmic sequences whose composed character lies in
# 0x370-0x3ff or 0x1f00-0x1fff.
for sequence in xorg_compose_sequences_algorithmic_uniqued:
	ch = ord(sequence[-1])
	if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
		num_algorithmic_greek += 1
758
759
# --algorithmic: print one line per algorithmically derivable sequence.
# Each entry of xorg_compose_sequences_algorithmic_uniqued holds numeric
# values followed by the composed character as its last element.
if opt_algorithmic:
	for sequence in xorg_compose_sequences_algorithmic_uniqued:
		# The last element is the composed character.
		letter = "".join(sequence[-1:])
		# The value just before the composed character is printed first;
		# the trailing comma keeps the following prints on the same line.
		print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
		# Then the remaining values, in stored order.
		for elem in sequence[:-2]:
			print "<0x%(keysym)04X>," % { 'keysym': elem },
		""" Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
		print "], recomposed as", letter.encode('utf-8'), "verified"
768
def num_of_keysyms(seq):
	""" Returns the length of seq minus one (the final element is not counted). """
	total_entries = len(seq)
	return total_entries - 1
771
def convert_UnotationToHex(arg):
	""" Rewrites a 'U' + four uppercase hex digits string as '0x' + digits.

	Anything else (other strings, non-string values) is returned unchanged.
	"""
	if not isinstance(arg, str):
		return arg
	if not match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
		return arg
	return sub('^U', '0x', arg)
777
def addprefix_GDK(arg):
	""" Formats one keysym for the generated table, followed by ', '.

	Values starting with '0x' are emitted as they are; every other name
	gets the 'GDK_KEY_' prefix.
	"""
	template = '%(arg)s, ' if match('^0x', arg) else 'GDK_KEY_%(arg)s, '
	return template % { 'arg': arg }
783
784if opt_gtk:
785	first_keysym = ""
786	sequence = []
787	compose_table = []
788	ct_second_part = []
789	ct_sequence_width = 2
790	start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
791	we_finished = False
792	counter = 0
793
794	sequence_iterator = iter(xorg_compose_sequences)
795	sequence = sequence_iterator.next()
796	while True:
797		first_keysym = sequence[0]					# Set the first keysym
798		compose_table.append([first_keysym, 0, 0, 0, 0, 0])
799		while sequence[0] == first_keysym:
800			compose_table[counter][num_of_keysyms(sequence)-1] += 1
801			try:
802				sequence = sequence_iterator.next()
803			except StopIteration:
804				we_finished = True
805				break
806		if we_finished:
807			break
808		counter += 1
809
810	ct_index = start_offset
811	for line_num in range(len(compose_table)):
812		for i in range(WIDTHOFCOMPOSETABLE):
813			occurences = compose_table[line_num][i+1]
814			compose_table[line_num][i+1] = ct_index
815			ct_index += occurences * (i+2)
816
817	for sequence in xorg_compose_sequences:
818		ct_second_part.append(map(convert_UnotationToHex, sequence))
819
820	print headerfile_start
821	for i in compose_table:
822		if opt_gtkexpanded:
823			print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
824			print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
825		elif not match('^0x', i[0]):
826			print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
827		else:
828			print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
829	for i in ct_second_part:
830		if opt_numeric:
831			for ks in i[1:][:-1]:
832				print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
833			print '0x%(cp)04X, ' % { 'cp':i[-1] }
834			"""
835			for ks in i[:-1]:
836				print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
837			print '0x%(cp)04X, ' % { 'cp':i[-1] }
838			"""
839		elif opt_gtkexpanded:
840			print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
841		else:
842			print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
843	print headerfile_end
844
def redecompose(codepoint):
	""" Recursively decomposes codepoint using unicodedatabase.

	A codepoint whose first decomposition field is '' or '0' is returned as
	the one-element list [codepoint]. Otherwise every decomposition field
	(skipping a leading '<tag>' field when present) is converted from hex
	and decomposed in turn, so the result is a list of nested lists.
	"""
	(name, decomposition, combiningclass) = unicodedatabase[codepoint]
	leading = decomposition[0]
	if leading == '' or leading == '0':
		return [codepoint]
	if match('<\w+>', leading):
		fields = decomposition[1:]
	else:
		fields = decomposition
	# Convert all fields first, then recurse, as two separate passes.
	numdecomposition = [stringtohex(field) for field in fields]
	return [redecompose(value) for value in numdecomposition]
854
855def process_unicodedata_file(verbose = False):
856	""" Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
857	filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
858	try:
859		unicodedatatxt = open(filename_unicodedatatxt, 'r')
860	except IOError, (errno, strerror):
861		print "I/O error(%s): %s" % (errno, strerror)
862		sys.exit(-1)
863	except:
864		print "Unexpected error: ", sys.exc_info()[0]
865		sys.exit(-1)
866	for line in unicodedatatxt.readlines():
867		if line[0] == "" or line[0] == '#':
868			continue
869		line = line[:-1]
870		uniproperties = split(';', line)
871		codepoint = stringtohex(uniproperties[0])
872		""" We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
873		if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
874			continue
875		name = uniproperties[1]
876		category = uniproperties[2]
877		combiningclass = uniproperties[3]
878		decomposition = uniproperties[5]
879		unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
880
881	counter_combinations = 0
882	counter_combinations_greek = 0
883	counter_entries = 0
884	counter_entries_greek = 0
885
886	for item in unicodedatabase.keys():
887		(name, decomposition, combiningclass) = unicodedatabase[item]
888		if decomposition[0] == '':
889			continue
890			print name, "is empty"
891		elif match('<\w+>', decomposition[0]):
892			continue
893			print name, "has weird", decomposition[0]
894		else:
895			sequence = map(stringtohex, decomposition)
896			chrsequence = map(unichr, sequence)
897			normalized = normalize('NFC', "".join(chrsequence))
898
899			""" print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
900			decomposedsequence = []
901			for subseq in map(redecompose, sequence):
902				for seqitem in subseq:
903					if isinstance(seqitem, list):
904						for i in seqitem:
905							if isinstance(i, list):
906								for j in i:
907									decomposedsequence.append(j)
908							else:
909								decomposedsequence.append(i)
910					else:
911						decomposedsequence.append(seqitem)
912			recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
913			if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
914				counter_entries += 1
915				counter_combinations += factorial(len(decomposedsequence)-1)
916				ch = item
917				if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
918					counter_entries_greek += 1
919					counter_combinations_greek += factorial(len(decomposedsequence)-1)
920				if verbose:
921					print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
922					print "[",
923					for elem in decomposedsequence:
924						print '<0x%(hex)04X>,' % { 'hex': elem },
925					print "], recomposed as", recomposedchar,
926					if unichr(item) == recomposedchar:
927						print "verified"
928
929	if verbose == False:
930		print "Unicode statistics from UnicodeData.txt"
931		print "Number of entries that can be algorithmically produced     :", counter_entries
932		print "  of which are for Greek                                   :", counter_entries_greek
933		print "Number of compose sequence combinations requiring          :", counter_combinations
934		print "  of which are for Greek                                   :", counter_combinations_greek
935		print "Note: We do not include partial compositions, "
936		print "thus the slight discrepancy in the figures"
937		print
938
# Final reporting stage, executed at module level. Each section below runs
# only when its option flag is set (the flags are presumably filled in by the
# command-line parsing earlier in the file -- confirm there).

# Verbose run over UnicodeData.txt: prints one line per character that can be
# composed algorithmically; the summary statistics are not printed in this mode.
if opt_unicodedatatxt:
	process_unicodedata_file(True)

# Summary report. The lists and counters used here (xorg_compose_sequences*,
# counter_multikey, num_algorithmic_greek, num_entries, zeroes,
# num_first_keysyms) are not defined in this section and are expected to have
# been computed earlier in the file.
if opt_statistics:
	print
	print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
	print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
	print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences)
	print "    of which have Multi_key                                :", counter_multikey
	print
	print "Algorithmic (stats for Xorg Compose file)"
	print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
	print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
	print "  of which are for Greek                                   :", num_algorithmic_greek
	print
	# Non-verbose call: prints the "Unicode statistics from UnicodeData.txt"
	# section at this point of the report.
	process_unicodedata_file()
	# NOTE(review): the literals 6, 2 and 5 below presumably stand for
	# WIDTHOFCOMPOSETABLE + 1, SIZEOFINT and WIDTHOFCOMPOSETABLE -- confirm
	# before changing either constant, since they are hard-coded here.
	print "Not algorithmic (stats from Xorg Compose file)"
	print "Number of sequences                                        :", len(xorg_compose_sequences)
	print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
	print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
	print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
	# The percentage divides by len(xorg_compose_sequences) * 6, so an empty
	# list raises ZeroDivisionError. Assuming zeroes is an int, Python 2 "/"
	# truncates the result to a whole number.
	print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
	print "Number of different first items                            :", num_first_keysyms
	# Same expression as "Flat array would have taken up" above.
	print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
	print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
	print
	# Flat-array size minus the savings printed above.
	print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
	print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
	print
	# 691 is a hard-coded sequence count for the old table; the factor 12 is
	# presumably its row width in integers -- TODO confirm.
	print "Existing (old) implementation in GTK+"
	print "Number of sequences in old gtkimcontextsimple.c            :", 691
	print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"
971