#
# (re)generate unicode property and type databases
#
# this script converts unicode database files (currently 5.2.0) to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
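
# For illustration, given the masks above and the classification rules in
# makeunicodetype() below: an uppercase letter like u"A" (category Lu) ends
# up with ALPHA_MASK | UPPER_MASK in its flags field, while a digit like
# u"5" (category Nd) gets DECIMAL_MASK | DIGIT_MASK | NUMERIC_MASK.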

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
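                # the first entry packs the prefix index (low byte) and
                # the decomposition length (high byte); code points follow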
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
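                    # (only canonical two-character decompositions of
                    # non-excluded characters whose first character has
                    # combining class 0 can recompose under NFC)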
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

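    # comp_data is a flattened total_first x total_last matrix: entry
    # (f, l) holds the composed character for the pair, or 0 if the two
    # characters do not compose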
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
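
    # A sketch (not part of this script) of how the C side is expected to
    # consume these two-level tables, mirroring the generated get_change_*
    # helpers below; the real consumer lives in Modules/unicodedata.c:
    #
    #   index = index1[(ch >> SHIFT)];
    #   index = index2[(index << SHIFT) + (ch & ((1 << SHIFT) - 1))];
    #   record = &_PyUnicode_Database_Records[index];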

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
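            # the deltas are stored masked to 16 bits; the C side is
            # expected to sign-extend them before adding them back
            # (NODELTA records store absolute code points instead)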
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split character type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

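        # case labels for code points >= 0x10000 exist only on wide (UCS-4)
        # builds, so runs of them are wrapped in #ifdef Py_UNICODE_WIDE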
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the line break"
    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
    print >>fp, " * type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence.  the latter
    # includes the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for word in w:
                l = words.get(word)
                if l:
                    l.append(None)
                else:
                    words[word] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: len(a[0]), reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
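        # e.g. "CAPITAL" is stored as "CAPITA" followed by chr(ord("L")+128)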
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for word in w:
                i = words[word]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)
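                    # (a reader recovers the index as
                    # ((byte1 - short) << 8) + byte2)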

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value must not be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # change to properties; not yet
                        pass
                    else:
                        class Difference(Exception): pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  properties] (17)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # all code points, U+0000..U+10FFFF

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
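                # each quickcheck property occupies a 2-bit field in the
                # packed value: 0 = Yes (the default), 1 = Maybe, 2 = No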
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
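            # fold the overflow byte back into the low bits so the
            # hash always fits in 24 bits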
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
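            # collision: fall back to an open-addressing probe sequence;
            # the increment is doubled on every step and reduced modulo the
            # table polynomial, which is intended to eventually reach a
            # free slot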
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
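    # Illustrative use (assumes nothing beyond the contract above): the
    # original table is recovered entry by entry via
    #
    #   t1, t2, shift = splitbins(data)
    #   mask = (1 << shift) - 1
    #   assert data[i] == t2[(t1[i >> shift] << shift) + (i & mask)]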

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)