1#!/usr/bin/python
2
3import sys
4
# The script needs exactly four input files on the command line.
if len (sys.argv) != 5:
	sys.stderr.write ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt" + "\n")
	sys.exit (1)
8
# Unicode blocks whose characters are dropped from the merged data.
BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]

# open() is used instead of the Python-2-only file() constructor.
files = [open (x) for x in sys.argv[1:]]

# Keep the first two lines of every input except the third one
# (UnicodeData.txt); they are echoed into the generated output.
headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
headers.append (["UnicodeData.txt does not have a header."])

# data[i]:   code point -> property value read from input file i.
# values[i]: property value -> number of code points carrying it in file i.
data = [{} for f in files]
values = [{} for f in files]
for i, f in enumerate (files):
	for line in f:

		# Drop a trailing '#' comment.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			# No ';' separator left: blank or comment-only line.
			continue

		# First field is either a single code point or a "start..end" range.
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		# The value is the second field, except for the third input file
		# where the third field is used.
		t = fields[1 if i != 2 else 2]

		for u in range (start, end + 1):
			data[i][u] = t
		values[i][t] = values[i].get (t, 0) + end - start + 1

	# The file has been fully consumed; release the handle.
	f.close ()
41
# Fallback value for each of the four properties, in input-file order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
for u in [0x034F, 0x2060] + list (range (0xFE00, 0xFE0F + 1)):
	data[0][u] = defaults[0]

# Merge data into one dict:
for tally, fallback in zip (values, defaults):
	tally[fallback] = tally.get (fallback, 0) + 1
merged = {}
for column, table in enumerate (data):
	# Only the first two inputs may introduce a code point; the later
	# ones just fill in their column for code points already present.
	may_add = column < 2
	for cp, prop in table.items ():
		row = merged.get (cp)
		if row is None:
			if not may_add:
				continue
			row = merged[cp] = list (defaults)
		row[column] = prop
# Drop every code point whose block (fourth column) is blacklisted.
data = {cp: row for cp, row in merged.items () if row[3] not in BLACKLISTED_BLOCKS}
del merged
num = len (data)
65
66
# Every property value name referenced by the classification code below,
# grouped by the property it belongs to.
property_names = (
	# General_Category
	[
		'Cc', 'Cf', 'Cn', 'Co', 'Cs',
		'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
		'Mc', 'Me', 'Mn',
		'Nd', 'Nl', 'No',
		'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
		'Sc', 'Sk', 'Sm', 'So',
		'Zl', 'Zp', 'Zs',
	] +
	# Indic_Syllabic_Category
	[
		'Other', 'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
		'Pure_Killer', 'Invisible_Stacker',
		'Vowel_Independent', 'Vowel_Dependent', 'Vowel',
		'Consonant_Placeholder', 'Consonant', 'Consonant_Dead',
		'Consonant_With_Stacker', 'Consonant_Prefixed',
		'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
		'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
		'Consonant_Head_Letter',
		'Modifying_Letter', 'Tone_Letter', 'Tone_Mark',
		'Gemination_Mark', 'Cantillation_Mark', 'Register_Shifter',
		'Syllable_Modifier', 'Consonant_Killer',
		'Non_Joiner', 'Joiner', 'Number_Joiner', 'Number',
		'Brahmi_Joining_Number',
	] +
	# Indic_Positional_Category
	[
		'Not_Applicable', 'Right', 'Left', 'Visual_Order_Left',
		'Left_And_Right', 'Top', 'Bottom', 'Top_And_Bottom',
		'Top_And_Right', 'Top_And_Left', 'Top_And_Left_And_Right',
		'Bottom_And_Right', 'Top_And_Bottom_And_Right', 'Overstruck',
	]
)
124
class PropertyValue(object):
	"""A named Unicode property value that compares equal to its name.

	An instance is equal to another PropertyValue with the same name and
	to the plain string holding that name, so values parsed from the data
	files (strings) can be compared directly with these constants.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __eq__(self, other):
		# Compare against other.name when it exists (another PropertyValue),
		# otherwise against the object itself (a plain string).  Unrelated
		# objects simply compare unequal instead of raising AttributeError.
		return self.name == getattr(other, 'name', other)
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		# Must agree with __eq__: an instance equals its name string, so it
		# has to hash like that string for dict/set lookups to find it.
		return hash(self.name)
134
# Maps every property value name to its PropertyValue object.
property_values = {}

for name in property_names:
	value = PropertyValue(name)
	# Both dictionaries are keyed by strings, so the duplicate / collision
	# checks must test the name itself rather than the PropertyValue object.
	assert name not in property_values
	assert name not in globals()
	property_values[name] = value
# Expose each value as a module-level constant (Lo, Virama, Top, ...).
globals().update(property_values)
143
144
def is_BASE(U, UISC, UGC):
	"""Predicate behind the 'B' entry of use_mapping.

	Matches on the syllabic category alone, or on one of several other
	syllabic categories when the General_Category is Lo.  U is unused.
	"""
	if UISC in (Number, Consonant, Consonant_Head_Letter,
		    #SPEC-OUTDATED Consonant_Placeholder,
		    Tone_Letter):
		return True
	if UGC == Lo:
		return UISC in (Avagraha, Bindu, Consonant_Final, Consonant_Medial,
				Consonant_Subjoined, Vowel, Vowel_Dependent)
	return False
def is_BASE_VOWEL(U, UISC, UGC):
	"""Predicate behind the 'IV' entry of use_mapping; only UISC is inspected."""
	independent_vowel = (UISC == Vowel_Independent)
	return independent_vowel
def is_BASE_IND(U, UISC, UGC):
	"""Predicate behind the 'IND' entry of use_mapping.

	Matches dead consonants and modifying letters, plus Po punctuation
	that is not already claimed by is_BASE_OTHER.
	"""
	#SPEC-BROKEN return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
	if UISC in (Consonant_Dead, Modifying_Letter):
		return True
	if UGC == Po:
		return not is_BASE_OTHER(U, UISC, UGC) # for 104E
	return False
def is_BASE_NUM(U, UISC, UGC):
	"""Predicate behind the 'N' entry of use_mapping; only UISC is inspected."""
	joining_number = (UISC == Brahmi_Joining_Number)
	return joining_number
def is_BASE_OTHER(U, UISC, UGC):
	"""Predicate behind the 'GB' entry of use_mapping.

	Matches consonant placeholders and a fixed list of code points.
	"""
	return (UISC == Consonant_Placeholder #SPEC-OUTDATED
		or U in (0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC,
			 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UGC):
	"""Predicate behind the 'CGJ' entry of use_mapping: true only for U+034F."""
	CGJ_CODEPOINT = 0x034F
	return U == CGJ_CODEPOINT
def is_CONS_FINAL(U, UISC, UGC):
	"""Predicate behind the 'F' entry of use_mapping.

	Matches succeeding rephas, and final consonants whose
	General_Category is not Lo.
	"""
	if UISC == Consonant_Succeeding_Repha:
		return True
	return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UGC):
	"""Predicate behind the 'FM' entry of use_mapping; only UISC is inspected."""
	#SPEC-OUTDATED return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
	syllable_modifier = (UISC == Syllable_Modifier)
	return syllable_modifier
def is_CONS_MED(U, UISC, UGC):
	"""Predicate behind the 'M' entry of use_mapping.

	Matches medial consonants whose General_Category is not Lo.
	"""
	if UGC == Lo:
		return False
	return UISC == Consonant_Medial
def is_CONS_MOD(U, UISC, UGC):
	"""Predicate behind the 'CM' entry of use_mapping; only UISC is inspected."""
	modifier_categories = (Nukta, Gemination_Mark, Consonant_Killer)
	return UISC in modifier_categories
def is_CONS_SUB(U, UISC, UGC):
	"""Predicate behind the 'SUB' entry of use_mapping.

	Matches subjoined consonants whose General_Category is not Lo.
	"""
	#SPEC-OUTDATED return UISC == Consonant_Subjoined
	if UGC == Lo:
		return False
	return UISC == Consonant_Subjoined
def is_HALANT(U, UISC, UGC):
	"""Predicate behind the 'H' entry of use_mapping; only UISC is inspected."""
	halant_categories = (Virama, Invisible_Stacker)
	return UISC in halant_categories
def is_HALANT_NUM(U, UISC, UGC):
	"""Predicate behind the 'HN' entry of use_mapping; only UISC is inspected."""
	number_joiner = (UISC == Number_Joiner)
	return number_joiner
def is_ZWNJ(U, UISC, UGC):
	"""Predicate behind the 'ZWNJ' entry of use_mapping; only UISC is inspected."""
	non_joiner = (UISC == Non_Joiner)
	return non_joiner
def is_ZWJ(U, UISC, UGC):
	"""Predicate behind the 'ZWJ' entry of use_mapping; only UISC is inspected."""
	joiner = (UISC == Joiner)
	return joiner
def is_Word_Joiner(U, UISC, UGC):
	"""Predicate behind the 'WJ' entry of use_mapping: true only for U+2060."""
	WORD_JOINER_CODEPOINT = 0x2060
	return U == WORD_JOINER_CODEPOINT
def is_OTHER(U, UISC, UGC):
	"""Predicate behind the 'O' entry of use_mapping.

	Matches characters whose syllabic category is Other, except those
	claimed by the code-point based predicates listed below.
	"""
	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
	if not UISC == Other:
		return False
	claimed_elsewhere = (
		is_SYM_MOD,
		is_CGJ,
		is_Word_Joiner,
		is_VARIATION_SELECTOR,
	)
	for claims in claimed_elsewhere:
		if claims(U, UISC, UGC):
			return False
	return True
def is_Reserved(U, UISC, UGC):
	"""Predicate behind the 'Rsv' entry of use_mapping: General_Category is 'Cn'."""
	unassigned = (UGC == 'Cn')
	return unassigned
def is_REPHA(U, UISC, UGC):
	"""Predicate behind the 'R' entry of use_mapping; only UISC is inspected."""
	#return UISC == Consonant_Preceding_Repha
	#SPEC-OUTDATED hack to categorize Consonant_With_Stacker and Consonant_Prefixed
	repha_categories = (Consonant_Preceding_Repha,
			    Consonant_With_Stacker,
			    Consonant_Prefixed)
	return UISC in repha_categories
def is_SYM(U, UISC, UGC):
	"""Predicate behind the 'S' entry of use_mapping.

	Matches General_Category So or Sc, with U+25CC explicitly excluded.
	"""
	#SPEC-OUTDATED return UGC in [So, Sc] or UISC == Symbol_Letter
	return (U != 0x25CC #SPEC-OUTDATED
		and UGC in (So, Sc))
def is_SYM_MOD(U, UISC, UGC):
	"""Predicate behind the 'SM' entry of use_mapping: U+1B6B through U+1B73."""
	first, last = 0x1B6B, 0x1B73
	return U in range (first, last + 1)
def is_VARIATION_SELECTOR(U, UISC, UGC):
	"""Predicate behind the 'VS' entry of use_mapping: U+FE00 through U+FE0F."""
	return U >= 0xFE00 and U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
	"""Predicate behind the 'V' entry of use_mapping.

	Matches pure killers, and (dependent) vowels whose General_Category
	is not Lo.
	"""
	if UISC == Pure_Killer:
		return True
	if UGC == Lo:
		return False
	return UISC in (Vowel, Vowel_Dependent)
def is_VOWEL_MOD(U, UISC, UGC):
	"""Predicate behind the 'VM' entry of use_mapping.

	Matches tone marks, cantillation marks, register shifters and
	visargas, plus bindus whose General_Category is not Lo.
	"""
	if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
		return True
	return UISC == Bindu and UGC != Lo
216
# USE category tag -> predicate(U, UISC, UGC) deciding membership.
# map_to_use() asserts that exactly one predicate accepts each character.
use_mapping = dict ([
	('B',    is_BASE),
	('IV',   is_BASE_VOWEL),
	('IND',  is_BASE_IND),
	('N',    is_BASE_NUM),
	('GB',   is_BASE_OTHER),
	('CGJ',  is_CGJ),
	('F',    is_CONS_FINAL),
	('FM',   is_CONS_FINAL_MOD),
	('M',    is_CONS_MED),
	('CM',   is_CONS_MOD),
	('SUB',  is_CONS_SUB),
	('H',    is_HALANT),
	('HN',   is_HALANT_NUM),
	('ZWNJ', is_ZWNJ),
	('ZWJ',  is_ZWJ),
	('WJ',   is_Word_Joiner),
	('O',    is_OTHER),
	('Rsv',  is_Reserved),
	('R',    is_REPHA),
	('S',    is_SYM),
	('SM',   is_SYM_MOD),
	('VS',   is_VARIATION_SELECTOR),
	('V',    is_VOWEL),
	('VM',   is_VOWEL_MOD),
])
243
# Positional refinement of USE categories.
#
# Outer key:   a USE category tag (a key of use_mapping).
# Inner key:   the suffix appended to that tag by map_to_use(), e.g. 'V' + 'Abv'.
# Inner value: the Indic_Positional_Category values that select the suffix.
#
# A category mapped to None may carry a positional category without
# tripping the assertion in map_to_use(), but never receives a suffix.
# Categories missing from this table must have a positional category of
# Not_Applicable or Visual_Order_Left.
#
# NOTE: the iteration order of the inner dictionaries determines the order
# in which the #define / #undef lines are emitted further down.
use_positions = {
	# Final consonants
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	# Medial consonants
	'M': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
		'Pre': [Left],
	},
	# Consonant modifiers
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	# Vowels
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	# Vowel modifiers
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	# Symbol modifiers
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	# Position allowed, but no suffix is generated.
	'H': None,
	'B': None,
	'FM': None,
	'SUB': None,
}
281
def map_to_use(data):
	"""Classify every code point of *data* into its USE category.

	data maps a code point to a 4-item sequence (UISC, UIPC, UGC, UBlock).
	The result maps each code point to (USE, UBlock), where USE is the
	use_mapping tag whose predicate accepted the character, followed by a
	positional suffix when use_positions defines suffixes for that tag.

	Raises AssertionError when a character is not accepted by exactly one
	predicate, when it carries a positional category that its USE category
	does not allow, or when the positional category does not select
	exactly one suffix.
	"""
	classifiers = use_mapping.items()
	result = {}
	for U, properties in data.items():
		UISC, UIPC, UGC, UBlock = properties

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 8.0, but
		# have UIPC
		if U == 0x17DD:
			UISC = Vowel_Dependent
		if 0x1CE2 <= U <= 0x1CE8:
			UISC = Cantillation_Mark

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED:
			UISC = Tone_Mark

		# Run every predicate; exactly one of them must accept.
		verdicts = [(tag, accepts(U, UISC, UGC)) for tag, accepts in classifiers]
		matches = [tag for tag, accepted in verdicts if accepted]
		assert len(matches) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, matches)
		USE = matches[0]

		# Resolve Indic_Positional_Category

		# TODO: Not in Unicode 8.0 yet, but in spec.
		if U == 0x1B6C:
			UIPC = Bottom

		# TODO: These should die, but have UIPC in Unicode 8.0
		if U in (0x953, 0x954):
			UIPC = Not_Applicable

		# TODO: In USE's override list but not in Unicode 8.0
		if U == 0x103C:
			UIPC = Left

		# TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
		if 0xA926 <= U <= 0xA92A:
			UIPC = Top
		if U == 0x111CA:
			UIPC = Bottom
		if U == 0x11300:
			UIPC = Top
		if U == 0x1133C:
			UIPC = Bottom
		if U == 0x1171E:
			UIPC = Left # Correct?!
		if 0x1CF2 <= U <= 0x1CF3:
			UIPC = Right
		if 0x1CF8 <= U <= 0x1CF9:
			UIPC = Top

		# A real position is only legal for categories listed in use_positions.
		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

		suffix_table = use_positions.get(USE, None)
		if suffix_table:
			suffixes = [suffix for suffix, positions in suffix_table.items()
				    if positions and UIPC in positions]
			assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
			USE = USE + suffixes[0]

		result[U] = (USE, UBlock)
	return result
334
# (category, block) pair reported for code points that have no entry.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Remove the outliers
# They are kept aside in `singles` and handled individually in the
# generated lookup function instead of occupying table space.
singles = {}
for u in (0x034F, 0x25CC, 0x1107F):
	singles[u] = data.pop (u)
343
344print "/* == Start of generated table == */"
345print "/*"
346print " * The following table is generated by running:"
347print " *"
348print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
349print " *"
350print " * on files with these headers:"
351print " *"
352for h in headers:
353	for l in h:
354		print " * %s" % (l.strip())
355print " */"
356print
357print '#include "hb-ot-shape-complex-use-private.hh"'
358print
359
360total = 0
361used = 0
362last_block = None
363def print_block (block, start, end, data):
364	global total, used, last_block
365	if block and block != last_block:
366		print
367		print
368		print "  /* %s */" % block
369		if start % 16:
370			print ' ' * (20 + (start % 16 * 6)),
371	num = 0
372	assert start % 8 == 0
373	assert (end+1) % 8 == 0
374	for u in range (start, end+1):
375		if u % 16 == 0:
376			print
377			print "  /* %04X */" % u,
378		if u in data:
379			num += 1
380		d = data.get (u, defaults)
381		sys.stdout.write ("%6s," % d[0])
382
383	total += end - start + 1
384	used += num
385	if block:
386		last_block = block
387
388uu = data.keys ()
389uu.sort ()
390
391last = -100000
392num = 0
393offset = 0
394starts = []
395ends = []
396for k,v in sorted(use_mapping.items()):
397	if k in use_positions and use_positions[k]: continue
398	print "#define %s	USE_%s	/* %s */" % (k, k, v.__name__[3:])
399for k,v in sorted(use_positions.items()):
400	if not v: continue
401	for suf in v.keys():
402		tag = k + suf
403		print "#define %s	USE_%s" % (tag, tag)
404print ""
405print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
406for u in uu:
407	if u <= last:
408		continue
409	block = data[u][1]
410
411	start = u//8*8
412	end = start+1
413	while end in uu and block == data[end][1]:
414		end += 1
415	end = (end-1)//8*8 + 7
416
417	if start != last + 1:
418		if start - last <= 1+16*3:
419			print_block (None, last+1, start-1, data)
420			last = start-1
421		else:
422			if last >= 0:
423				ends.append (last + 1)
424				offset += ends[-1] - starts[-1]
425			print
426			print
427			print "#define use_offset_0x%04xu %d" % (start, offset)
428			starts.append (start)
429
430	print_block (block, start, end, data)
431	last = end
432ends.append (last + 1)
433offset += ends[-1] - starts[-1]
434print
435print
436occupancy = used * 100. / total
437page_bits = 12
438print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)
439print
440print "USE_TABLE_ELEMENT_TYPE"
441print "hb_use_get_categories (hb_codepoint_t u)"
442print "{"
443print "  switch (u >> %d)" % page_bits
444print "  {"
445pages = set([u>>page_bits for u in starts+ends+singles.keys()])
446for p in sorted(pages):
447	print "    case 0x%0Xu:" % p
448	for (start,end) in zip (starts, ends):
449		if p not in [start>>page_bits, end>>page_bits]: continue
450		offset = "use_offset_0x%04xu" % start
451		print "      if (hb_in_range (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)
452	for u,d in singles.items ():
453		if p != u>>page_bits: continue
454		print "      if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0])
455	print "      break;"
456	print ""
457print "    default:"
458print "      break;"
459print "  }"
460print "  return USE_O;"
461print "}"
462print
463for k in sorted(use_mapping.keys()):
464	if k in use_positions and use_positions[k]: continue
465	print "#undef %s" % k
466for k,v in sorted(use_positions.items()):
467	if not v: continue
468	for suf in v.keys():
469		tag = k + suf
470		print "#undef %s" % tag
471print
472print "/* == End of generated table == */"
473
474# Maintain at least 50% occupancy in the table */
475if occupancy < 50:
476	raise Exception ("Table too sparse, please investigate: ", occupancy)
477