gen-indic-table.py revision ae4a2b9365051c23c9a299cf76f3ab7e661999b1
1#!/usr/bin/python
2
3import sys
4
if len (sys.argv) != 4:
	print >>sys.stderr, "usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt"
	sys.exit (1)

files = [open (x) for x in sys.argv[1:]]

try:
	# Keep the first two lines of every input file; they are printed in the
	# banner of the generated output.
	headers = [[f.readline () for i in range (2)] for f in files]

	# blocks:    value of the third input -> (first, last) codepoint of its range
	# data[i]:   codepoint -> value assigned by input file i
	# values[i]: value -> number of codepoints carrying it in input file i
	blocks = {}
	data = [{} for f in files]
	values = [{} for f in files]
	for i, f in enumerate (files):
		for line in f:

			# Drop everything from '#' to the end of the line
			j = line.find ('#')
			if j >= 0:
				line = line[:j]

			fields = [x.strip () for x in line.split (';')]
			# No ';' on the line: nothing to record
			if len (fields) == 1:
				continue

			# First field is a single hex codepoint or an inclusive "START..END" range
			uu = fields[0].split ('..')
			start = int (uu[0], 16)
			if len (uu) == 1:
				end = start
			else:
				end = int (uu[1], 16)

			t = fields[1]

			for u in range (start, end + 1):
				data[i][u] = t
			# Count codepoints rather than input lines: this total is later
			# printed as "N chars" next to each #define.
			values[i][t] = values[i].get (t, 0) + end - start + 1

			if i == 2:
				blocks[t] = (start, end)
finally:
	# The inputs are fully consumed (or parsing failed); release the handles.
	for f in files:
		f.close ()
42
# Fold the per-file maps into a single dict: codepoint -> [value0, value1, value2]
defaults = ('Other', 'Not_Applicable', 'No_Block')
# Make sure every default value is present in the per-file value counts
for idx, default_value in enumerate (defaults):
	values[idx][default_value] = values[idx].get (default_value, 0) + 1
merged = {}
for idx, per_file in enumerate (data):
	for cp, value in per_file.items ():
		if cp not in merged:
			# The third input only annotates codepoints that the first two
			# already introduced; it never adds new entries.
			if idx == 2:
				continue
			merged[cp] = list (defaults)
		merged[cp][idx] = value
data = merged
del merged
num = len (data)
58
# NO-BREAK SPACE and DOTTED CIRCLE are outliers: take them out of the main
# data and keep them in their own dict.
singles = {}
for u in [0x00A0, 0x25CC]:
	singles[u] = data.pop (u)
64
65print "/* == Start of generated table == */"
66print "/*"
67print " * The following table is generated by running:"
68print " *"
69print " *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt"
70print " *"
71print " * on files with these headers:"
72print " *"
73for h in headers:
74	for l in h:
75		print " * %s" % (l.strip())
76print " */"
77print
78print "#ifndef HB_OT_SHAPE_COMPLEX_INDIC_TABLE_HH"
79print "#define HB_OT_SHAPE_COMPLEX_INDIC_TABLE_HH"
80print
81
# Hand-picked abbreviations for some values, to make them more readable and to
# avoid duplicates.  One dict per category: index 0 goes with the first input
# file, index 1 with the second.
short = [
	{
		"Bindu": 'Bi',
		"Visarga": 'Vs',
		"Vowel": 'Vo',
		"Vowel_Dependent": 'M',
		"Other": 'x',
	},
	{
		"Not_Applicable": 'x',
	},
]

# Abbreviations already taken, per category, seeded with the hand-picked ones
all_shorts = [list (aliases.values ()) for aliases in short]
100
101what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
102what_short = ["ISC", "IMC"]
103for i in range (2):
104	print
105	vv = values[i].keys ()
106	vv.sort ()
107	for v in vv:
108		v_no_and = v.replace ('_And_', '_')
109		if v in short[i]:
110			s = short[i][v]
111		else:
112			s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')])
113			if s in all_shorts[i]:
114				raise Exception ("Duplicate short value alias", v, s)
115			all_shorts[i].append (s)
116			short[i][v] = s
117		print "#define %s_%s	%s_%s	%s/* %3d chars; %s */" % \
118			(what_short[i], s, what[i], v.upper (), \
119			'	'* ((48-1 - len (what[i]) - 1 - len (v)) / 8), \
120			values[i][v], v)
121print
122print "#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)"
123print
124print
125
126total = 0
127used = 0
128def print_block (block, start, end, data):
129	print
130	print
131	print "  /* %s  (%04X..%04X) */" % (block, start, end)
132	num = 0
133	for u in range (start, end+1):
134		if u % 8 == 0:
135			print
136			print "  /* %04X */" % u,
137		if u in data:
138			num += 1
139		d = data.get (u, defaults)
140		sys.stdout.write ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])))
141
142	global total, used
143	total += end - start + 1
144	used += num
145
146uu = data.keys ()
147uu.sort ()
148
149last = -1
150num = 0
151offset = 0
152starts = []
153ends = []
154print "static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {"
155for u in uu:
156	if u <= last:
157		continue
158	block = data[u][2]
159	(start, end) = blocks[block]
160
161	if start != last + 1:
162		if start - last <= 33:
163			print_block ("FILLER", last+1, start-1, data)
164			last = start-1
165		else:
166			if last >= 0:
167				ends.append (last + 1)
168				offset += ends[-1] - starts[-1]
169			print
170			print
171			print "#define indic_offset_0x%04x %d" % (start, offset)
172			starts.append (start)
173
174	print_block (block, start, end, data)
175	last = end
176ends.append (last + 1)
177offset += ends[-1] - starts[-1]
178print
179print
180print "#define indic_offset_total %d" % offset
181print
182occupancy = used * 100. / total
183print "}; /* Table occupancy: %d%% */" % occupancy
184print
185print "static INDIC_TABLE_ELEMENT_TYPE"
186print "get_indic_categories (hb_codepoint_t u)"
187print "{"
188for (start,end) in zip (starts, ends):
189	offset = "indic_offset_0x%04x" % start
190	print "  if (0x%04X <= u && u <= 0x%04X) return indic_table[u - 0x%04X + %s];" % (start, end, start, offset)
191for u,d in singles.items ():
192	print "  if (unlikely (u == 0x%04X)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]])
193print "  return _(x,x);"
194print "}"
195print
196print "#undef _"
197for i in range (2):
198	print
199	vv = values[i].keys ()
200	vv.sort ()
201	for v in vv:
202		print "#undef %s_%s" % \
203			(what_short[i], short[i][v])
204print
205print "#endif /* HB_OT_SHAPE_COMPLEX_INDIC_TABLE_HH */"
206print
207print "/* == End of generated table == */"
208
# Maintain at least 30% occupancy in the table
too_sparse = occupancy < 30
if too_sparse:
	raise Exception ("Table too sparse, please investigate: ", occupancy)
212