#!/usr/bin/python

import sys

if len (sys.argv) != 5:
    print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
    sys.exit (1)

BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]

files = [file (x) for x in sys.argv[1:]]

headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
headers.append (["UnicodeData.txt does not have a header."])

data = [{} for f in files]
values = [{} for f in files]
for i, f in enumerate (files):
    for line in f:

        j = line.find ('#')
        if j >= 0:
            line = line[:j]

        fields = [x.strip () for x in line.split (';')]
        if len (fields) == 1:
            continue

        uu = fields[0].split ('..')
        start = int (uu[0], 16)
        if len (uu) == 1:
            end = start
        else:
            end = int (uu[1], 16)

        t = fields[1 if i != 2 else 2]

        for u in range (start, end + 1):
            data[i][u] = t
        values[i][t] = values[i].get (t, 0) + end - start + 1

defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
data[0][0x034F] = defaults[0]
data[0][0x2060] = defaults[0]
for u in range (0xFE00, 0xFE0F + 1):
    data[0][u] = defaults[0]

# Merge data into one dict:
for i,v in enumerate (defaults):
    values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (data):
    for u,v in d.items ():
        if i >= 2 and not u in combined:
            continue
        if not u in combined:
            combined[u] = list (defaults)
        combined[u][i] = v
combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
data = combined
del combined
num = len (data)


property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu',
    'Visarga',
    'Avagraha',
    'Nukta',
    'Virama',
    'Pure_Killer',
    'Invisible_Stacker',
    'Vowel_Independent',
    'Vowel_Dependent',
    'Vowel',
    'Consonant_Placeholder',
    'Consonant',
    'Consonant_Dead',
    'Consonant_With_Stacker',
    'Consonant_Prefixed',
    'Consonant_Preceding_Repha',
    'Consonant_Succeeding_Repha',
    'Consonant_Subjoined',
    'Consonant_Medial',
    'Consonant_Final',
    'Consonant_Head_Letter',
    'Modifying_Letter',
    'Tone_Letter',
    'Tone_Mark',
    'Gemination_Mark',
    'Cantillation_Mark',
    'Register_Shifter',
    'Syllable_Modifier',
    'Consonant_Killer',
    'Non_Joiner',
    'Joiner',
    'Number_Joiner',
    'Number',
    'Brahmi_Joining_Number',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right',
    'Left',
    'Visual_Order_Left',
    'Left_And_Right',
    'Top',
    'Bottom',
    'Top_And_Bottom',
    'Top_And_Right',
    'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
]

class PropertyValue(object):
    def __init__(self, name_):
        self.name = name_
    def __str__(self):
        return self.name
    def __eq__(self, other):
        return self.name == (other if isinstance(other, basestring) else other.name)
    def __ne__(self, other):
        return not (self == other)

property_values = {}

for name in property_names:
    value = PropertyValue(name)
    assert value not in property_values
    assert value not in globals()
    property_values[name] = value
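# The update below injects every PropertyValue into the module's global
# namespace, so the predicates further down can refer to property values by
# bare name (e.g. UISC == Vowel_Independent) instead of comparing strings.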
globals().update(property_values)


def is_BASE(U, UISC, UGC):
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
                     #SPEC-OUTDATED Consonant_Placeholder,
                     Tone_Letter] or
            (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                                    Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_VOWEL(U, UISC, UGC):
    return UISC == Vowel_Independent
def is_BASE_IND(U, UISC, UGC):
    #SPEC-BROKEN return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    return (UISC in [Consonant_Dead, Modifying_Letter] or
            (UGC == Po and not is_BASE_OTHER(U, UISC, UGC))) # for 104E
def is_BASE_NUM(U, UISC, UGC):
    return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UGC):
    if UISC == Consonant_Placeholder: return True #SPEC-OUTDATED
    return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC,
                 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UGC):
    return U == 0x034F
def is_CONS_FINAL(U, UISC, UGC):
    return ((UISC == Consonant_Final and UGC != Lo) or
            UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UGC):
    #SPEC-OUTDATED return UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UGC):
    return UISC == Consonant_Medial and UGC != Lo
def is_CONS_MOD(U, UISC, UGC):
    return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
def is_CONS_SUB(U, UISC, UGC):
    #SPEC-OUTDATED return UISC == Consonant_Subjoined
    return UISC == Consonant_Subjoined and UGC != Lo
def is_HALANT(U, UISC, UGC):
    return UISC in [Virama, Invisible_Stacker]
def is_HALANT_NUM(U, UISC, UGC):
    return UISC == Number_Joiner
def is_ZWNJ(U, UISC, UGC):
    return UISC == Non_Joiner
def is_ZWJ(U, UISC, UGC):
    return UISC == Joiner
def is_Word_Joiner(U, UISC, UGC):
    return U == 0x2060
def is_OTHER(U, UISC, UGC):
    #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    return (UISC == Other
            and not is_SYM_MOD(U, UISC, UGC)
            and not is_CGJ(U, UISC, UGC)
            and not is_Word_Joiner(U, UISC, UGC)
            and not is_VARIATION_SELECTOR(U, UISC, UGC)
           )
def is_Reserved(U, UISC, UGC):
    return UGC == 'Cn'
def is_REPHA(U, UISC, UGC):
    #return UISC == Consonant_Preceding_Repha
    #SPEC-OUTDATED hack to categorize Consonant_With_Stacker and Consonant_Prefixed
    return UISC in [Consonant_Preceding_Repha, Consonant_With_Stacker, Consonant_Prefixed]
def is_SYM(U, UISC, UGC):
    if U == 0x25CC: return False #SPEC-OUTDATED
    #SPEC-OUTDATED return UGC in [So, Sc] or UISC == Symbol_Letter
    return UGC in [So, Sc]
def is_SYM_MOD(U, UISC, UGC):
    return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
def is_VARIATION_SELECTOR(U, UISC, UGC):
    return 0xFE00 <= U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
    return (UISC == Pure_Killer or
            (UGC != Lo and UISC in [Vowel, Vowel_Dependent]))
def is_VOWEL_MOD(U, UISC, UGC):
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
            (UGC != Lo and UISC == Bindu))

use_mapping = {
    'B': is_BASE,
    'IV': is_BASE_VOWEL,
    'IND': is_BASE_IND,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'H': is_HALANT,
    'HN': is_HALANT_NUM,
    'ZWNJ': is_ZWNJ,
    'ZWJ': is_ZWJ,
    'WJ': is_Word_Joiner,
    'O': is_OTHER,
    'Rsv': is_Reserved,
    'R': is_REPHA,
    'S': is_SYM,
    'SM': is_SYM_MOD,
    'VS': is_VARIATION_SELECTOR,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
}

use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
        'Pre': [Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'B': None,
    'FM': None,
    'SUB': None,
}

def map_to_use(data):
    out = {}
    items = use_mapping.items()
    for U,(UISC,UIPC,UGC,UBlock) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 8.0, but
        # have UIPC
        if U == 0x17DD: UISC = Vowel_Dependent
        if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED: UISC = Tone_Mark

        evals = [(k, v(U,UISC,UGC)) for k,v in items]
        values = [k for k,v in evals if v]
        assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
        USE = values[0]

        # Resolve Indic_Positional_Category

        # TODO: Not in Unicode 8.0 yet, but in spec.
        if U == 0x1B6C: UIPC = Bottom

        # TODO: These should die, but have UIPC in Unicode 8.0
        if U in [0x953, 0x954]: UIPC = Not_Applicable

        # TODO: In USE's override list but not in Unicode 8.0
        if U == 0x103C: UIPC = Left

        # TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
        if 0xA926 <= U <= 0xA92A: UIPC = Top
        if U == 0x111CA: UIPC = Bottom
        if U == 0x11300: UIPC = Top
        if U == 0x1133C: UIPC = Bottom
        if U == 0x1171E: UIPC = Left # Correct?!
        if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
        if 0x1CF8 <= U <= 0x1CF9: UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            values = [k for k,v in pos_mapping.items() if v and UIPC in v]
            assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
            USE = USE + values[0]

        out[U] = (USE, UBlock)
    return out

defaults = ('O', 'No_Block')
data = map_to_use(data)

# Remove the outliers
singles = {}
for u in [0x034F, 0x25CC, 0x1107F]:
    singles[u] = data[u]
    del data[u]

print "/* == Start of generated table == */"
print "/*"
print " * The following table is generated by running:"
print " *"
print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
print " *"
print " * on files with these headers:"
print " *"
for h in headers:
    for l in h:
        print " * %s" % (l.strip())
print " */"
print
print '#include "hb-ot-shape-complex-use-private.hh"'
print

total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
    global total, used, last_block
    if block and block != last_block:
        print
        print
        print "  /* %s */" % block
        if start % 16:
            print ' ' * (20 + (start % 16 * 6)),
    num = 0
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    for u in range (start, end+1):
        if u % 16 == 0:
            print
            print "  /* %04X */" % u,
        if u in data:
            num += 1
        d = data.get (u, defaults)
        sys.stdout.write ("%6s," % d[0])

    total += end - start + 1
    used += num
    if block:
        last_block = block

uu = data.keys ()
uu.sort ()

last = -100000
num = 0
offset = 0
starts = []
ends = []
for k,v in sorted(use_mapping.items()):
    if k in use_positions and use_positions[k]: continue
    print "#define %s USE_%s /* %s */" % (k, k, v.__name__[3:])
for k,v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print "#define %s USE_%s" % (tag, tag)
print ""
print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
for u in uu:
    if u <= last:
        continue
    block = data[u][1]

    start = u//8*8
    end = start+1
    while end in uu and block == data[end][1]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            print_block (None, last+1, start-1, data)
            last = start-1
        else:
            if last >= 0:
                ends.append (last + 1)
                offset += ends[-1] - starts[-1]
            print
            print
            print "#define use_offset_0x%04xu %d" % (start, offset)
            starts.append (start)

    print_block (block, start, end, data)
    last = end
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print
print
occupancy = used * 100. / total
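# occupancy is the percentage of emitted table slots that carry real data; it
# is checked against a 50% floor at the end of this script.  page_bits sets
# the page size used by the switch in the generated hb_use_get_categories():
# codepoints are bucketed by u >> page_bits, i.e. 4096 codepoints per page.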
page_bits = 12
print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)
print
print "USE_TABLE_ELEMENT_TYPE"
print "hb_use_get_categories (hb_codepoint_t u)"
print "{"
print "  switch (u >> %d)" % page_bits
print "  {"
pages = set([u>>page_bits for u in starts+ends+singles.keys()])
for p in sorted(pages):
    print "    case 0x%0Xu:" % p
    for (start,end) in zip (starts, ends):
        if p not in [start>>page_bits, end>>page_bits]: continue
        offset = "use_offset_0x%04xu" % start
        print "      if (hb_in_range (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)
    for u,d in singles.items ():
        if p != u>>page_bits: continue
        print "      if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0])
    print "      break;"
    print ""
print "    default:"
print "      break;"
print "  }"
print "  return USE_O;"
print "}"
print
for k in sorted(use_mapping.keys()):
    if k in use_positions and use_positions[k]: continue
    print "#undef %s" % k
for k,v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print "#undef %s" % tag
print
print "/* == End of generated table == */"

# Maintain at least 50% occupancy in the table
if occupancy < 50:
    raise Exception ("Table too sparse, please investigate: ", occupancy)
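# Example invocation (illustrative only; the generated C goes to stdout and
# the output file name depends on the build setup, e.g. the USE table source
# in HarfBuzz):
#
#   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt \
#       UnicodeData.txt Blocks.txt > hb-ot-shape-complex-use-table.cc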