#
# (re)generate unicode property and type databases
#
# this script converts a unicode 5.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."
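
    # Every code point gets an entry in `index`, but identical property
    # tuples are stored only once: `cache` maps each tuple to its slot
    # in `table`, so the 0x110000 index entries share a small set of
    # unique records (unassigned code points all point at dummy record 0).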

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."
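
    # comp_data is a total_first x total_last matrix flattened into a
    # single array: the NFC pair (f, l) lives at comp_data[f*total_last+l],
    # so with (say) total_last == 54, the pair (2, 5) occupies slot 113.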

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
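        # The generated get_change_X() below reads these two arrays as a
        # two-level table: the top bits of the code point pick an entry
        # in changes_X_index, which selects the block of changes_X_data
        # that the low `shift` bits index into.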
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself.  Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
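            # The three case mappings are stored as signed 16-bit deltas
            # from the code point when they fit; e.g. for U+0061 'a' the
            # uppercase delta is 0x41 - 0x61 == -32, kept as 0xFFE0 in
            # two's complement.  Otherwise NODELTA_MASK marks the record
            # as holding absolute values.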
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp
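
    # The haswide/hasnonewide bookkeeping above (and in the generators
    # below) keeps code points past U+FFFF compilable on narrow builds:
    # the first case above 0xFFFF opens an "#ifdef Py_UNICODE_WIDE"
    # block, and the matching "#endif" is emitted before the shared
    # return when narrow cases exist too, or after it when every case
    # is wide.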

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the line break"
    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
    print >>fp, " * type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.)
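
    # For example, "LATIN SMALL LETTER A" was stored above with a
    # trailing NUL, so split() yields "LATIN", "SMALL", "LETTER" and
    # "A\0"; the NUL keeps a name-final word distinct from the same
    # word in the middle of a name.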

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."
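
    # Decoding sketch: a phrasebook entry below `short` is a word index
    # itself; an entry b >= short starts a two-byte escape decoding to
    # ((b - short) << 8) + the following byte.  E.g. with short == 253
    # (illustrative), word 700 is stored as the pair (255, 188), since
    # 700 == 2*256 + 188 and 253 + 2 == 255.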

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value had better not be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # change to properties; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
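
    # One change_record tuple per code point: the zip order below
    # (bidir, category, decimal, mirrored, numeric) has to match the
    # change_record struct that the generated get_change_X() functions
    # return pointers into (see Modules/unicodedata.c).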
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  properties] (17)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # all 17 planes

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])
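
        # Unihan.txt supplies the numeric values that UnicodeData.txt
        # leaves blank for han characters (kAccountingNumeric,
        # kPrimaryNumeric, kOtherNumeric); e.g. a line like
        # "U+4E07 kPrimaryNumeric 10000" patches field 8 of U+4E07.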
        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
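
# A lookup probes the table the same way it is filled above: start at
# slot (~myhash(key, magic)) & mask and, on each miss, advance by an
# increment derived from the hash, doubling it per probe and folding it
# back with `poly` whenever it outgrows the mask, so the stride always
# stays inside the table.  The C lookup in Modules/unicodedata.c has to
# mirror this sequence exactly.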

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)
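
# A quick, hypothetical sanity check for splitbins (not part of the
# build; the function already asserts this identity under __debug__):
#
#     >>> t = [0]*100 + [5]*100
#     >>> t1, t2, shift = splitbins(t)
#     >>> mask = (1 << shift) - 1
#     >>> all(t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
#     ...     for i in range(len(t)))
#     True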