makeunicodedata.py revision b5c980b8028af0876e40f71c94b82cd02e327bf0
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.2"

# The Unicode Database
UNIDATA_VERSION = "3.2.0"
UNICODE_DATA = "UnicodeData.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
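#
# e.g. an uppercase letter such as "A" (category Lu) gets
# ALPHA_MASK | UPPER_MASK = 0x81 in its generated type record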

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA, "..."

    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)

    print len(filter(None, unicode.table)), "characters"

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)
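    # comp_first[c] / comp_last[c] are set for every character that can
    # occur as the first / last element of a canonical pair; they are
    # renumbered into dense indices further down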

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
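                # a decomposition contributes a composition pair only
                # if it is canonical (no <...> prefix), maps to exactly
                # two characters, is not listed in
                # CompositionExclusions.txt, and its first character is
                # a starter (combining class 0)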
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

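    # comp_data is a dense [first][last] matrix, flattened to one
    # dimension; most slots stay 0 (no composition), which is what
    # makes the splitbins() compression applied below effective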
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
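            # each delta is stored in 16-bit two's complement form:
            # masking with 0xffff keeps negative offsets representable
            # in an unsigned table, and adding the masked delta to a
            # 16-bit character value modulo 2**16 recovers the mapping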
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish
    # between words inside a sentence and words ending a sentence;
    # the latter include the trailing null byte)
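    #
    # e.g. "LATIN SMALL LETTER A" is stored as "LATIN SMALL LETTER A\0",
    # so it yields the words "LATIN", "SMALL", "LETTER", and "A\0";
    # that final "A\0" is a different lexicon word than an internal "A"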

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
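        # e.g. the word "CAPITAL" becomes "CAPITA" + chr(ord("L")+128),
        # and a sentence-final word like "A\0" becomes "A" + chr(128)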
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
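                    # an index >= short does not fit in one byte: the
                    # first byte (short + i>>8) selects one of the
                    # `escapes` 256-word pages, and the second byte
                    # picks the word within that page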
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:

    def __init__(self, filename, exclusions, expand=1):
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
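        # UnicodeData.txt abbreviates large blocks (e.g. the CJK
        # ideographs) as a pair of <..., First> / <..., Last> records;
        # every code point in between inherits the properties of the
        # First record, with the name field cleared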
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    field[0] = hex(i)
                    table[i] = field

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
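#
# myhash is a simple multiplicative string hash kept within 24 bits:
# whenever the running value grows past bit 23, the top byte is folded
# back in with xor.  the C lookup code is expected to compute the same
# function, since the generated table is only usable with it.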

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

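        # collisions are resolved by open addressing: the probe
        # increment is derived from the hash and stepped like a linear
        # feedback shift register (shift left; on overflow, xor in the
        # size-specific polynomial), which keeps probe sequences from
        # clustering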
        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
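
    # example: with shift = 2, t = [5, 5, 5, 5, 5, 5, 5, 5] falls into
    # two identical 4-element bins, so t2 = [5, 5, 5, 5] is stored once
    # and t1 = [0, 0] points both bins at it; t[6] is then recovered as
    # t2[(t1[6 >> 2] << 2) + (6 & 3)] = t2[2] = 5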

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)
782