makeunicodedata.py revision b5c980b8028af0876e40f71c94b82cd02e327bf0
1# 2# (re)generate unicode property and type databases 3# 4# this script converts a unicode 3.2 database file to 5# Modules/unicodedata_db.h, Modules/unicodename_db.h, 6# and Objects/unicodetype_db.h 7# 8# history: 9# 2000-09-24 fl created (based on bits and pieces from unidb) 10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table 11# 2000-09-25 fl added character type table 12# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0) 13# 2000-11-03 fl expand first/last ranges 14# 2001-01-19 fl added character name tables (2.1) 15# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold 16# 2002-09-11 wd use string methods 17# 2002-10-18 mvl update to Unicode 3.2 18# 2002-10-22 mvl generate NFC tables 19# 2002-11-24 mvl expand all ranges, sort names version-independently 20# 2002-11-25 mvl add UNIDATA_VERSION 21# 22# written by Fredrik Lundh (fredrik@pythonware.com) 23# 24 25import sys 26 27SCRIPT = sys.argv[0] 28VERSION = "2.2" 29 30# The Unicode Database 31UNIDATA_VERSION = "3.2.0" 32UNICODE_DATA = "UnicodeData.txt" 33COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt" 34 35CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", 36 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", 37 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk", 38 "So" ] 39 40BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", 41 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", 42 "ON" ] 43 44# note: should match definitions in Objects/unicodectype.c 45ALPHA_MASK = 0x01 46DECIMAL_MASK = 0x02 47DIGIT_MASK = 0x04 48LOWER_MASK = 0x08 49LINEBREAK_MASK = 0x10 50SPACE_MASK = 0x20 51TITLE_MASK = 0x40 52UPPER_MASK = 0x80 53 54def maketables(trace=0): 55 56 print "--- Reading", UNICODE_DATA, "..." 57 58 unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS) 59 60 print len(filter(None, unicode.table)), "characters" 61 62 makeunicodename(unicode, trace) 63 makeunicodedata(unicode, trace) 64 makeunicodetype(unicode, trace) 65 66# -------------------------------------------------------------------- 67# unicode character properties 68 69def makeunicodedata(unicode, trace): 70 71 dummy = (0, 0, 0, 0) 72 table = [dummy] 73 cache = {0: dummy} 74 index = [0] * len(unicode.chars) 75 76 FILE = "Modules/unicodedata_db.h" 77 78 print "--- Preparing", FILE, "..." 79 80 # 1) database properties 81 82 for char in unicode.chars: 83 record = unicode.table[char] 84 if record: 85 # extract database properties 86 category = CATEGORY_NAMES.index(record[2]) 87 combining = int(record[3]) 88 bidirectional = BIDIRECTIONAL_NAMES.index(record[4]) 89 mirrored = record[9] == "Y" 90 item = ( 91 category, combining, bidirectional, mirrored 92 ) 93 # add entry to index and item tables 94 i = cache.get(item) 95 if i is None: 96 cache[item] = i = len(table) 97 table.append(item) 98 index[char] = i 99 100 # 2) decomposition data 101 102 decomp_data = [0] 103 decomp_prefix = [""] 104 decomp_index = [0] * len(unicode.chars) 105 decomp_size = 0 106 107 comp_pairs = [] 108 comp_first = [None] * len(unicode.chars) 109 comp_last = [None] * len(unicode.chars) 110 111 for char in unicode.chars: 112 record = unicode.table[char] 113 if record: 114 if record[5]: 115 decomp = record[5].split() 116 # prefix 117 if decomp[0][0] == "<": 118 prefix = decomp.pop(0) 119 else: 120 prefix = "" 121 try: 122 i = decomp_prefix.index(prefix) 123 except ValueError: 124 i = len(decomp_prefix) 125 decomp_prefix.append(prefix) 126 prefix = i 127 assert prefix < 256 128 # content 129 decomp = [prefix + (len(decomp)<<8)] +\ 130 map(lambda s: int(s, 16), decomp) 131 # Collect NFC pairs 132 if not prefix and len(decomp) == 3 and \ 133 char not in unicode.exclusions and \ 134 unicode.table[decomp[1]][3] == "0": 135 p, l, r = decomp 136 comp_first[l] = 1 137 comp_last[r] = 1 138 comp_pairs.append((l,r,char)) 139 try: 140 i = decomp_data.index(decomp) 141 except ValueError: 142 i = len(decomp_data) 143 decomp_data.extend(decomp) 144 decomp_size = decomp_size + len(decomp) * 2 145 else: 146 i = 0 147 decomp_index[char] = i 148 149 f = l = 0 150 comp_first_ranges = [] 151 comp_last_ranges = [] 152 prev_f = prev_l = None 153 for i in unicode.chars: 154 if comp_first[i] is not None: 155 comp_first[i] = f 156 f += 1 157 if prev_f is None: 158 prev_f = (i,i) 159 elif prev_f[1]+1 == i: 160 prev_f = prev_f[0],i 161 else: 162 comp_first_ranges.append(prev_f) 163 prev_f = (i,i) 164 if comp_last[i] is not None: 165 comp_last[i] = l 166 l += 1 167 if prev_l is None: 168 prev_l = (i,i) 169 elif prev_l[1]+1 == i: 170 prev_l = prev_l[0],i 171 else: 172 comp_last_ranges.append(prev_l) 173 prev_l = (i,i) 174 comp_first_ranges.append(prev_f) 175 comp_last_ranges.append(prev_l) 176 total_first = f 177 total_last = l 178 179 comp_data = [0]*(total_first*total_last) 180 for f,l,char in comp_pairs: 181 f = comp_first[f] 182 l = comp_last[l] 183 comp_data[f*total_last+l] = char 184 185 print len(table), "unique properties" 186 print len(decomp_prefix), "unique decomposition prefixes" 187 print len(decomp_data), "unique decomposition entries:", 188 print decomp_size, "bytes" 189 print total_first, "first characters in NFC" 190 print total_last, "last characters in NFC" 191 print len(comp_pairs), "NFC pairs" 192 193 print "--- Writing", FILE, "..." 194 195 fp = open(FILE, "w") 196 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION) 197 print >>fp 198 print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION 199 print >>fp, "/* a list of unique database records */" 200 print >>fp, \ 201 "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {" 202 for item in table: 203 print >>fp, " {%d, %d, %d, %d}," % item 204 print >>fp, "};" 205 print >>fp 206 207 print >>fp, "/* Reindexing of NFC first characters. */" 208 print >>fp, "#define TOTAL_FIRST",total_first 209 print >>fp, "#define TOTAL_LAST",total_last 210 print >>fp, "struct reindex{int start;short count,index;};" 211 print >>fp, "struct reindex nfc_first[] = {" 212 for start,end in comp_first_ranges: 213 print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start]) 214 print >>fp," {0,0,0}" 215 print >>fp,"};\n" 216 print >>fp, "struct reindex nfc_last[] = {" 217 for start,end in comp_last_ranges: 218 print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start]) 219 print >>fp," {0,0,0}" 220 print >>fp,"};\n" 221 222 # FIXME: <fl> the following tables could be made static, and 223 # the support code moved into unicodedatabase.c 224 225 print >>fp, "/* string literals */" 226 print >>fp, "const char *_PyUnicode_CategoryNames[] = {" 227 for name in CATEGORY_NAMES: 228 print >>fp, " \"%s\"," % name 229 print >>fp, " NULL" 230 print >>fp, "};" 231 232 print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {" 233 for name in BIDIRECTIONAL_NAMES: 234 print >>fp, " \"%s\"," % name 235 print >>fp, " NULL" 236 print >>fp, "};" 237 238 print >>fp, "static const char *decomp_prefix[] = {" 239 for name in decomp_prefix: 240 print >>fp, " \"%s\"," % name 241 print >>fp, " NULL" 242 print >>fp, "};" 243 244 # split record index table 245 index1, index2, shift = splitbins(index, trace) 246 247 print >>fp, "/* index tables for the database records */" 248 print >>fp, "#define SHIFT", shift 249 Array("index1", index1).dump(fp, trace) 250 Array("index2", index2).dump(fp, trace) 251 252 # split decomposition index table 253 index1, index2, shift = splitbins(decomp_index, trace) 254 255 print >>fp, "/* decomposition data */" 256 Array("decomp_data", decomp_data).dump(fp, trace) 257 258 print >>fp, "/* index tables for the decomposition data */" 259 print >>fp, "#define DECOMP_SHIFT", shift 260 Array("decomp_index1", index1).dump(fp, trace) 261 Array("decomp_index2", index2).dump(fp, trace) 262 263 index, index2, shift = splitbins(comp_data, trace) 264 print >>fp, "/* NFC pairs */" 265 print >>fp, "#define COMP_SHIFT", shift 266 Array("comp_index", index).dump(fp, trace) 267 Array("comp_data", index2).dump(fp, trace) 268 269 fp.close() 270 271# -------------------------------------------------------------------- 272# unicode character type tables 273 274def makeunicodetype(unicode, trace): 275 276 FILE = "Objects/unicodetype_db.h" 277 278 print "--- Preparing", FILE, "..." 279 280 # extract unicode types 281 dummy = (0, 0, 0, 0, 0, 0) 282 table = [dummy] 283 cache = {0: dummy} 284 index = [0] * len(unicode.chars) 285 286 for char in unicode.chars: 287 record = unicode.table[char] 288 if record: 289 # extract database properties 290 category = record[2] 291 bidirectional = record[4] 292 flags = 0 293 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]: 294 flags |= ALPHA_MASK 295 if category == "Ll": 296 flags |= LOWER_MASK 297 if category == "Zl" or bidirectional == "B": 298 flags |= LINEBREAK_MASK 299 if category == "Zs" or bidirectional in ("WS", "B", "S"): 300 flags |= SPACE_MASK 301 if category == "Lt": 302 flags |= TITLE_MASK 303 if category == "Lu": 304 flags |= UPPER_MASK 305 # use delta predictor for upper/lower/title 306 if record[12]: 307 upper = int(record[12], 16) - char 308 assert -32768 <= upper <= 32767 309 upper = upper & 0xffff 310 else: 311 upper = 0 312 if record[13]: 313 lower = int(record[13], 16) - char 314 assert -32768 <= lower <= 32767 315 lower = lower & 0xffff 316 else: 317 lower = 0 318 if record[14]: 319 title = int(record[14], 16) - char 320 assert -32768 <= lower <= 32767 321 title = title & 0xffff 322 else: 323 title = 0 324 # decimal digit, integer digit 325 decimal = 0 326 if record[6]: 327 flags |= DECIMAL_MASK 328 decimal = int(record[6]) 329 digit = 0 330 if record[7]: 331 flags |= DIGIT_MASK 332 digit = int(record[7]) 333 item = ( 334 flags, upper, lower, title, decimal, digit 335 ) 336 # add entry to index and item tables 337 i = cache.get(item) 338 if i is None: 339 cache[item] = i = len(table) 340 table.append(item) 341 index[char] = i 342 343 print len(table), "unique character type entries" 344 345 print "--- Writing", FILE, "..." 346 347 fp = open(FILE, "w") 348 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION) 349 print >>fp 350 print >>fp, "/* a list of unique character type descriptors */" 351 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {" 352 for item in table: 353 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item 354 print >>fp, "};" 355 print >>fp 356 357 # split decomposition index table 358 index1, index2, shift = splitbins(index, trace) 359 360 print >>fp, "/* type indexes */" 361 print >>fp, "#define SHIFT", shift 362 Array("index1", index1).dump(fp, trace) 363 Array("index2", index2).dump(fp, trace) 364 365 fp.close() 366 367# -------------------------------------------------------------------- 368# unicode name database 369 370def makeunicodename(unicode, trace): 371 372 FILE = "Modules/unicodename_db.h" 373 374 print "--- Preparing", FILE, "..." 375 376 # collect names 377 names = [None] * len(unicode.chars) 378 379 for char in unicode.chars: 380 record = unicode.table[char] 381 if record: 382 name = record[1].strip() 383 if name and name[0] != "<": 384 names[char] = name + chr(0) 385 386 print len(filter(lambda n: n is not None, names)), "distinct names" 387 388 # collect unique words from names (note that we differ between 389 # words inside a sentence, and words ending a sentence. the 390 # latter includes the trailing null byte. 391 392 words = {} 393 n = b = 0 394 for char in unicode.chars: 395 name = names[char] 396 if name: 397 w = name.split() 398 b = b + len(name) 399 n = n + len(w) 400 for w in w: 401 l = words.get(w) 402 if l: 403 l.append(None) 404 else: 405 words[w] = [len(words)] 406 407 print n, "words in text;", b, "bytes" 408 409 wordlist = words.items() 410 411 # sort on falling frequency, then by name 412 def cmpwords((aword, alist),(bword, blist)): 413 r = -cmp(len(alist),len(blist)) 414 if r: 415 return r 416 return cmp(aword, bword) 417 wordlist.sort(cmpwords) 418 419 # figure out how many phrasebook escapes we need 420 escapes = 0 421 while escapes * 256 < len(wordlist): 422 escapes = escapes + 1 423 print escapes, "escapes" 424 425 short = 256 - escapes 426 427 assert short > 0 428 429 print short, "short indexes in lexicon" 430 431 # statistics 432 n = 0 433 for i in range(short): 434 n = n + len(wordlist[i][1]) 435 print n, "short indexes in phrasebook" 436 437 # pick the most commonly used words, and sort the rest on falling 438 # length (to maximize overlap) 439 440 wordlist, wordtail = wordlist[:short], wordlist[short:] 441 wordtail.sort(lambda a, b: len(b[0])-len(a[0])) 442 wordlist.extend(wordtail) 443 444 # generate lexicon from words 445 446 lexicon_offset = [0] 447 lexicon = "" 448 words = {} 449 450 # build a lexicon string 451 offset = 0 452 for w, x in wordlist: 453 # encoding: bit 7 indicates last character in word (chr(128) 454 # indicates the last character in an entire string) 455 ww = w[:-1] + chr(ord(w[-1])+128) 456 # reuse string tails, when possible 457 o = lexicon.find(ww) 458 if o < 0: 459 o = offset 460 lexicon = lexicon + ww 461 offset = offset + len(w) 462 words[w] = len(lexicon_offset) 463 lexicon_offset.append(o) 464 465 lexicon = map(ord, lexicon) 466 467 # generate phrasebook from names and lexicon 468 phrasebook = [0] 469 phrasebook_offset = [0] * len(unicode.chars) 470 for char in unicode.chars: 471 name = names[char] 472 if name: 473 w = name.split() 474 phrasebook_offset[char] = len(phrasebook) 475 for w in w: 476 i = words[w] 477 if i < short: 478 phrasebook.append(i) 479 else: 480 # store as two bytes 481 phrasebook.append((i>>8) + short) 482 phrasebook.append(i&255) 483 484 assert getsize(phrasebook) == 1 485 486 # 487 # unicode name hash table 488 489 # extract names 490 data = [] 491 for char in unicode.chars: 492 record = unicode.table[char] 493 if record: 494 name = record[1].strip() 495 if name and name[0] != "<": 496 data.append((name, char)) 497 498 # the magic number 47 was chosen to minimize the number of 499 # collisions on the current data set. if you like, change it 500 # and see what happens... 501 502 codehash = Hash("code", data, 47) 503 504 print "--- Writing", FILE, "..." 505 506 fp = open(FILE, "w") 507 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION) 508 print >>fp 509 print >>fp, "#define NAME_MAXLEN", 256 510 print >>fp 511 print >>fp, "/* lexicon */" 512 Array("lexicon", lexicon).dump(fp, trace) 513 Array("lexicon_offset", lexicon_offset).dump(fp, trace) 514 515 # split decomposition index table 516 offset1, offset2, shift = splitbins(phrasebook_offset, trace) 517 518 print >>fp, "/* code->name phrasebook */" 519 print >>fp, "#define phrasebook_shift", shift 520 print >>fp, "#define phrasebook_short", short 521 522 Array("phrasebook", phrasebook).dump(fp, trace) 523 Array("phrasebook_offset1", offset1).dump(fp, trace) 524 Array("phrasebook_offset2", offset2).dump(fp, trace) 525 526 print >>fp, "/* name->code dictionary */" 527 codehash.dump(fp, trace) 528 529 fp.close() 530 531# -------------------------------------------------------------------- 532# the following support code is taken from the unidb utilities 533# Copyright (c) 1999-2000 by Secret Labs AB 534 535# load a unicode-data file from disk 536 537import sys 538 539class UnicodeData: 540 541 def __init__(self, filename, exclusions, expand=1): 542 file = open(filename) 543 table = [None] * 0x110000 544 while 1: 545 s = file.readline() 546 if not s: 547 break 548 s = s.strip().split(";") 549 char = int(s[0], 16) 550 table[char] = s 551 552 # expand first-last ranges 553 if expand: 554 field = None 555 for i in range(0, 0x110000): 556 s = table[i] 557 if s: 558 if s[1][-6:] == "First>": 559 s[1] = "" 560 field = s[:] 561 elif s[1][-5:] == "Last>": 562 s[1] = "" 563 field = None 564 elif field: 565 field[0] = hex(i) 566 table[i] = field 567 568 # public attributes 569 self.filename = filename 570 self.table = table 571 self.chars = range(0x110000) # unicode 3.2 572 573 file = open(exclusions) 574 self.exclusions = {} 575 for s in file: 576 s = s.strip() 577 if not s: 578 continue 579 if s[0] == '#': 580 continue 581 char = int(s.split()[0],16) 582 self.exclusions[char] = 1 583 584 def uselatin1(self): 585 # restrict character range to ISO Latin 1 586 self.chars = range(256) 587 588# hash table tools 589 590# this is a straight-forward reimplementation of Python's built-in 591# dictionary type, using a static data structure, and a custom string 592# hash algorithm. 593 594def myhash(s, magic): 595 h = 0 596 for c in map(ord, s.upper()): 597 h = (h * magic) + c 598 ix = h & 0xff000000L 599 if ix: 600 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff 601 return h 602 603SIZES = [ 604 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17), 605 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3), 606 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9), 607 (2097152,5), (4194304,3), (8388608,33), (16777216,27) 608] 609 610class Hash: 611 def __init__(self, name, data, magic): 612 # turn a (key, value) list into a static hash table structure 613 614 # determine table size 615 for size, poly in SIZES: 616 if size > len(data): 617 poly = size + poly 618 break 619 else: 620 raise AssertionError, "ran out of polynominals" 621 622 print size, "slots in hash table" 623 624 table = [None] * size 625 626 mask = size-1 627 628 n = 0 629 630 hash = myhash 631 632 # initialize hash table 633 for key, value in data: 634 h = hash(key, magic) 635 i = (~h) & mask 636 v = table[i] 637 if v is None: 638 table[i] = value 639 continue 640 incr = (h ^ (h >> 3)) & mask; 641 if not incr: 642 incr = mask 643 while 1: 644 n = n + 1 645 i = (i + incr) & mask 646 v = table[i] 647 if v is None: 648 table[i] = value 649 break 650 incr = incr << 1 651 if incr > mask: 652 incr = incr ^ poly 653 654 print n, "collisions" 655 self.collisions = n 656 657 for i in range(len(table)): 658 if table[i] is None: 659 table[i] = 0 660 661 self.data = Array(name + "_hash", table) 662 self.magic = magic 663 self.name = name 664 self.size = size 665 self.poly = poly 666 667 def dump(self, file, trace): 668 # write data to file, as a C array 669 self.data.dump(file, trace) 670 file.write("#define %s_magic %d\n" % (self.name, self.magic)) 671 file.write("#define %s_size %d\n" % (self.name, self.size)) 672 file.write("#define %s_poly %d\n" % (self.name, self.poly)) 673 674# stuff to deal with arrays of unsigned integers 675 676class Array: 677 678 def __init__(self, name, data): 679 self.name = name 680 self.data = data 681 682 def dump(self, file, trace=0): 683 # write data to file, as a C array 684 size = getsize(self.data) 685 if trace: 686 print >>sys.stderr, self.name+":", size*len(self.data), "bytes" 687 file.write("static ") 688 if size == 1: 689 file.write("unsigned char") 690 elif size == 2: 691 file.write("unsigned short") 692 else: 693 file.write("unsigned int") 694 file.write(" " + self.name + "[] = {\n") 695 if self.data: 696 s = " " 697 for item in self.data: 698 i = str(item) + ", " 699 if len(s) + len(i) > 78: 700 file.write(s + "\n") 701 s = " " + i 702 else: 703 s = s + i 704 if s.strip(): 705 file.write(s + "\n") 706 file.write("};\n\n") 707 708def getsize(data): 709 # return smallest possible integer size for the given array 710 maxdata = max(data) 711 if maxdata < 256: 712 return 1 713 elif maxdata < 65536: 714 return 2 715 else: 716 return 4 717 718def splitbins(t, trace=0): 719 """t, trace=0 -> (t1, t2, shift). Split a table to save space. 720 721 t is a sequence of ints. This function can be useful to save space if 722 many of the ints are the same. t1 and t2 are lists of ints, and shift 723 is an int, chosen to minimize the combined size of t1 and t2 (in C 724 code), and where for each i in range(len(t)), 725 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 726 where mask is a bitmask isolating the last "shift" bits. 727 728 If optional arg trace is non-zero (default zero), progress info 729 is printed to sys.stderr. The higher the value, the more info 730 you'll get. 731 """ 732 733 import sys 734 if trace: 735 def dump(t1, t2, shift, bytes): 736 print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % ( 737 len(t1), len(t2), shift, bytes) 738 print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \ 739 "bytes" 740 n = len(t)-1 # last valid index 741 maxshift = 0 # the most we can shift n and still have something left 742 if n > 0: 743 while n >> 1: 744 n >>= 1 745 maxshift += 1 746 del n 747 bytes = sys.maxint # smallest total size so far 748 t = tuple(t) # so slices can be dict keys 749 for shift in range(maxshift + 1): 750 t1 = [] 751 t2 = [] 752 size = 2**shift 753 bincache = {} 754 for i in range(0, len(t), size): 755 bin = t[i:i+size] 756 index = bincache.get(bin) 757 if index is None: 758 index = len(t2) 759 bincache[bin] = index 760 t2.extend(bin) 761 t1.append(index >> shift) 762 # determine memory size 763 b = len(t1)*getsize(t1) + len(t2)*getsize(t2) 764 if trace > 1: 765 dump(t1, t2, shift, b) 766 if b < bytes: 767 best = t1, t2, shift 768 bytes = b 769 t1, t2, shift = best 770 if trace: 771 print >>sys.stderr, "Best:", 772 dump(t1, t2, shift, bytes) 773 if __debug__: 774 # exhaustively verify that the decomposition is correct 775 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits 776 for i in xrange(len(t)): 777 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 778 return best 779 780if __name__ == "__main__": 781 maketables(1) 782