makeunicodedata.py revision 2101348830ff0d65cebd4caf886011f45bcc7618
1#
# makeunicodedata.py -- generate a compact version of the unicode property
# database (unicodedata_db.h)
4#
5
6import sys
7
# Script name and version; both are written into the banner comment of
# the generated header.
SCRIPT = sys.argv[0]
VERSION = "1.0"

# Path of the UnicodeData text file that maketable() loads.
UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

# General category abbreviations.  maketable() stores a category as its
# position in this list (via list.index) and dumps the list itself as a
# C string array, so the order is significant.  "Cn" appears twice;
# list.index() always returns the first occurrence (position 0).
CATEGORY_NAMES = [
    "Cn",
    "Lu", "Ll", "Lt",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
    "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
]

# Bidirectional class abbreviations, handled the same way as the
# categories above.  Position 0 is the empty string.
BIDIRECTIONAL_NAMES = [
    "",
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF",
    "EN", "ES", "ET", "AN", "CS",
    "NSM", "BN",
    "B", "S", "WS", "ON",
]
21
def _write_string_array(out, cname, names):
    # Write names to out as a NULL-terminated C array of string literals.
    out.write("const char *%s[] = {\n" % cname)
    for name in names:
        out.write("    \"%s\",\n" % name)
    out.write("    NULL\n")
    out.write("};\n")


def maketable():
    """Generate the C header "unicodedata_db.h" in the current directory.

    Loads the file named by UNICODE_DATA and, for every code point that
    has a record, builds the tuple

        (category, combining, bidirectional, mirrored, decomposition)

    from fields 2, 3, 4, 9 and 5 of the record.  Category and
    bidirectional class are stored as positions in CATEGORY_NAMES and
    BIDIRECTIONAL_NAMES; the decomposition is a quoted C string literal,
    or NULL when the field is empty.  Identical tuples are stored once in
    a record table, and a per-code-point index into that table is split
    into two levels with splitbins().  Code points without a record keep
    index 0, the all-zero dummy record.

    Raises ValueError if a record uses a category or bidirectional class
    that is missing from the name lists.
    """
    unidata = UnicodeData(UNICODE_DATA)

    # extract unicode properties
    dummy = (0, 0, 0, 0, "NULL")
    table = [dummy]
    # The cache maps record tuple -> position in table, so the dummy has
    # to be registered under its tuple; otherwise a record equal to it
    # would be appended a second time.
    cache = {dummy: 0}
    index = [0] * len(unidata.chars)

    for char in unidata.chars:
        record = unidata.table[char]
        if not record:
            continue
        # extract database properties
        category = CATEGORY_NAMES.index(record[2])
        combining = int(record[3])
        bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
        mirrored = record[9] == "Y"
        if record[5]:
            decomposition = '"%s"' % record[5]
        else:
            decomposition = "NULL"
        item = (category, combining, bidirectional, mirrored, decomposition)
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # FIXME: we really should compress the decomposition stuff
    # (see the unidb utilities for one way to do this)

    # split index table (done before the output file is opened, so a
    # failure here does not leave a truncated header behind)
    index1, index2, shift = splitbins(index)

    FILE = "unicodedata_db.h"

    # Write through an explicit file object instead of rebinding
    # sys.stdout, and close it even if writing fails part-way.
    out = open(FILE, "w")
    try:
        out.write("/* this file was generated by %s %s */\n"
                  % (SCRIPT, VERSION))
        out.write("\n")
        out.write("/* a list of unique database records */\n")
        out.write("const _PyUnicode_DatabaseRecord "
                  "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            out.write("    {%d, %d, %d, %d, %s},\n" % item)
        out.write("};\n")
        out.write("\n")

        out.write("/* string literals */\n")
        _write_string_array(out, "_PyUnicode_CategoryNames",
                            CATEGORY_NAMES)
        _write_string_array(out, "_PyUnicode_BidirectionalNames",
                            BIDIRECTIONAL_NAMES)

        out.write("/* index tables used to find the right database "
                  "record */\n")
        out.write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(out)
        Array("index2", index2).dump(out)
    finally:
        out.close()
94
95# --------------------------------------------------------------------
96# the following support code is taken from the unidb utilities
97# Copyright (c) 1999-2000 by Secret Labs AB
98
99# load a unicode-data file from disk
100
101import string, sys
102
class UnicodeData:
    """Contents of a UnicodeData text file, indexed by code point.

    Public attributes:

    filename -- the path the data was read from
    table -- list of 65536 entries; table[cp] is the list of
        semicolon-separated fields of the line whose first field is the
        hexadecimal code point cp, or None if the file has no such line
    chars -- the code points to process; range(65536) until uselatin1()
        narrows it to range(256)
    """

    def __init__(self, filename):
        """Load filename.

        Blank lines are skipped.  Raises IndexError if a line describes
        a code point of 65536 or above, and ValueError if the first
        field of a line is not a hexadecimal number.
        """
        table = [None] * 65536
        fp = open(filename)
        try:
            while 1:
                line = fp.readline()
                if not line:
                    break
                line = line.strip()
                if not line:
                    # tolerate empty lines (e.g. at the end of the file)
                    continue
                fields = line.split(";")
                table[int(fields[0], 16)] = fields
        finally:
            # always release the file handle, even on a parse error
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (0-255)."""
        self.chars = range(256)
124
125# stuff to deal with arrays of unsigned integers
126
class Array:
    """A named sequence of unsigned integers that can be dumped as C code."""

    def __init__(self, name, data):
        # name: identifier used for the C array
        # data: sequence of non-negative integers
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file as a static C array definition.

        The element type is the smallest of unsigned char, unsigned short
        and unsigned int that getsize() reports as sufficient; an empty
        array is declared as unsigned char.  Items are packed into lines
        of at most 78 characters.
        """
        # getsize() needs at least one item to look at, so only ask it
        # when there is data.
        if self.data:
            size = getsize(self.data)
        else:
            size = 1
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    # current line is full: flush it and start a new one
                    file.write(line + "\n")
                    line = "    " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
157
def getsize(data):
    """Return the smallest integer size, in bytes, for the given array.

    The result is 1 if every item is below 256, 2 if every item is below
    65536, and 4 otherwise.  An empty sequence yields 1.
    """
    if not data:
        # max() would raise ValueError on an empty sequence
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
167
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0).

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    import sys
    report = "%d+%d bins at shift %d; %d bytes\n"

    t = tuple(t)    # so slices can be dict keys
    if not t:
        # nothing to split; getsize() below needs at least one item
        return [], [], 0
    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t)*getsize(t)))

    # the most we can shift the last valid index and still have
    # something left
    last = len(t) - 1
    maxshift = 0
    while last >> 1:
        last >>= 1
        maxshift += 1

    best = None         # (t1, t2, shift) of the smallest split so far
    best_size = None    # its total size in bytes
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        seen = {}       # chunk contents -> offset of that chunk in t2
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            offset = seen.get(chunk)
            if offset is None:
                offset = len(t2)
                seen[chunk] = offset
                t2.extend(chunk)
            t1.append(offset >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            sys.stderr.write(report % (len(t1), len(t2), shift, b))
        if best is None or b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: " + report
                         % (len(t1), len(t2), shift, best_size))
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
228
# Generate the header only when this file is run as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    maketable()
231