# makeunicodedata.py revision 9c6850510c814dea4c4f5a5c7ff63c5e8ad3976b
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
15
16import sys
17
# script name and generator version; both end up in the banner comment
# written at the top of each generated header
SCRIPT = sys.argv[0]
VERSION = "1.1"

# database file read by maketables()
UNICODE_DATA = "UnicodeData-Latest.txt"

# general category names.  the position of a name in this list is the
# number stored in the generated records, so the order must not change.
# note that "Cn" is listed twice; list.index("Cn") always yields 0.
CATEGORY_NAMES = [
    "Cn",
    "Lu", "Ll", "Lt",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
    "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
    ]

# bidirectional category names; as above, list position is the stored
# number, and the empty string occupies position 0
BIDIRECTIONAL_NAMES = [
    "",
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF",
    "EN", "ES", "ET", "AN", "CS",
    "NSM", "BN",
    "B", "S", "WS", "ON",
    ]

# character type flags, one bit each
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 1 << 0
DECIMAL_MASK = 1 << 1
DIGIT_MASK = 1 << 2
LOWER_MASK = 1 << 3
LINEBREAK_MASK = 1 << 4
SPACE_MASK = 1 << 5
TITLE_MASK = 1 << 6
UPPER_MASK = 1 << 7
41
def _intern(item, table, cache):
    """Return the index of item in table, appending it first if it is new.

    cache maps every item already present in table to its index, so no
    scan of table is needed.
    """
    i = cache.get(item)
    if i is None:
        cache[item] = i = len(table)
        table.append(item)
    return i


def _case_delta(field, char):
    """Return the offset from char to the hex code point in field.

    The offset is reduced to 16 bits ("delta predictor").  An empty
    field means there is no mapping and yields 0.
    """
    if field:
        return (int(field, 16) - char) & 0xffff
    return 0


def _write_names(fp, declaration, names):
    """Write names to fp as a NULL-terminated C array of string literals.

    declaration is the opening line of the array, up to and including
    the "{".
    """
    fp.write(declaration + "\n")
    for name in names:
        fp.write("    \"%s\",\n" % name)
    fp.write("    NULL\n")
    fp.write("};\n")


def _make_database_header(unidata):
    """Build the property and decomposition tables and write
    Modules/unicodedata_db.h."""

    # 1) database properties

    # record 0 is a dummy shared by all unassigned code points.  the
    # cache maps record -> table index, so a character whose properties
    # happen to equal the dummy reuses entry 0 instead of adding a
    # duplicate record.
    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unidata.chars)

    for char in unidata.chars:
        record = unidata.table[char]
        if record:
            item = (
                CATEGORY_NAMES.index(record[2]),        # category
                int(record[3]),                         # combining
                BIDIRECTIONAL_NAMES.index(record[4]),   # bidirectional
                record[9] == "Y",                       # mirrored
                )
            index[char] = _intern(item, table, cache)

    # 2) decomposition data

    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...

    # entry 0 is the empty string, used for characters that have no
    # decomposition; the dictionary avoids a linear search of
    # decomp_data for every character.
    decomp_data = [""]
    decomp_cache = {"": 0}
    decomp_index = [0] * len(unidata.chars)

    for char in unidata.chars:
        record = unidata.table[char]
        if record and record[5]:
            decomp_index[char] = _intern(record[5], decomp_data,
                                         decomp_cache)

    fp = open("Modules/unicodedata_db.h", "w")
    try:
        fp.write("/* this file was generated by %s %s */\n"
                 % (SCRIPT, VERSION))
        fp.write("\n")
        fp.write("/* a list of unique database records */\n")
        fp.write("const _PyUnicode_DatabaseRecord "
                 "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            fp.write("    {%d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        # FIXME: the following tables should be made static, and
        # the support code moved into unicodedatabase.c

        fp.write("/* string literals */\n")
        _write_names(fp, "const char *_PyUnicode_CategoryNames[] = {",
                     CATEGORY_NAMES)
        _write_names(fp, "const char *_PyUnicode_BidirectionalNames[] = {",
                     BIDIRECTIONAL_NAMES)
        _write_names(fp, "static const char *decomp_data[] = {",
                     decomp_data)

        # split record index table
        index1, index2, shift = splitbins(index)

        fp.write("/* index tables for the database records */\n")
        fp.write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index)

        fp.write("/* index tables for the decomposition data */\n")
        fp.write("#define DECOMP_SHIFT %d\n" % shift)
        Array("decomp_index1", index1).dump(fp)
        Array("decomp_index2", index2).dump(fp)
    finally:
        # make sure the header is flushed and closed even on errors
        fp.close()


def _make_type_header(unidata):
    """Build the character type table and write Objects/unicodetype_db.h."""

    # 3) unicode type data

    # record 0 is a dummy shared by unassigned code points and by any
    # character whose type record is all zeros (see cache note in
    # _make_database_header)
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unidata.chars)

    for char in unidata.chars:
        record = unidata.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            upper = _case_delta(record[12], char)
            lower = _case_delta(record[13], char)
            title = _case_delta(record[14], char)
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            index[char] = _intern(item, table, cache)

    sys.stdout.write("%d ctype entries\n" % len(table))

    fp = open("Objects/unicodetype_db.h", "w")
    try:
        fp.write("/* this file was generated by %s %s */\n"
                 % (SCRIPT, VERSION))
        fp.write("\n")
        fp.write("/* a list of unique character type descriptors */\n")
        fp.write("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {\n")
        for item in table:
            fp.write("    {%d, %d, %d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        # split type index table
        index1, index2, shift = splitbins(index)

        fp.write("/* type indexes */\n")
        fp.write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)
    finally:
        fp.close()


def maketables():
    """Regenerate the unicode property and type database headers.

    Reads the database file named by UNICODE_DATA and writes two C
    header files, relative to the current directory:

    - Modules/unicodedata_db.h: unique (category, combining,
      bidirectional, mirrored) records, the category and bidirectional
      name tables, the decomposition strings, and two-level index
      tables for both.
    - Objects/unicodetype_db.h: unique (flags, upper, lower, title,
      decimal, digit) records and their two-level index tables.

    The number of type records is reported on standard output.
    """
    unidata = UnicodeData(UNICODE_DATA)
    _make_database_header(unidata)
    _make_type_header(unidata)
225
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
231
232import string, sys
233
class UnicodeData:
    """Contents of a unicode database file, indexed by code point.

    Public attributes:

    filename -- name of the file the data was loaded from
    table    -- list of 65536 entries; table[code] is the list of
                ";"-separated fields from the line describing that
                code point, or None if the file has no such line
    chars    -- the code points to process; range(65536) unless
                restricted with uselatin1()
    """

    def __init__(self, filename):
        """Load the database file named filename.

        Each non-blank line is split on ";" and the first field is
        parsed as a hexadecimal code point.  Raises ValueError if that
        field is not a hexadecimal number, and IndexError if the code
        point does not fit in the 65536-entry table.
        """
        table = [None] * 65536
        fp = open(filename)
        try:
            for line in fp.readlines():
                line = line.strip()
                if not line:
                    # tolerate blank lines (e.g. at the end of the file)
                    continue
                fields = line.split(";")
                table[int(fields[0], 16)] = fields
        finally:
            # do not leave the input file open, even if parsing fails
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (0..255)."""
        self.chars = range(256)
255
# stuff to deal with arrays of unsigned integers
257
class Array:
    """A named sequence of integers that can be written out as a C array."""

    def __init__(self, name, data):
        # name is used as the C identifier, data holds the elements
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the array to file as a static C array definition.

        The element type is unsigned char, unsigned short or unsigned
        int, depending on the item size reported by getsize().  The
        values are written comma-separated on lines indented by four
        spaces; a line is flushed before an item that would make it
        longer than 78 characters.  The definition is followed by a
        blank line.
        """
        c_types = {1: "unsigned char", 2: "unsigned short"}
        size = getsize(self.data)
        # print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        file.write(c_types.get(size, "unsigned int"))
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            indent = "    "
            line = indent
            for value in self.data:
                cell = "%s, " % (value,)
                if len(line) + len(cell) <= 78:
                    line = line + cell
                else:
                    # line is full: flush it and start the next one
                    file.write(line + "\n")
                    line = indent + cell
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
287        file.write("};\n\n")
288
def getsize(data):
    """Return the smallest C integer size, in bytes, that can hold data.

    data is a sequence of integers.  The result is 1 if the largest
    item is below 256, 2 if it is below 65536, and 4 otherwise.  An
    empty sequence needs no more than the smallest size, so it yields 1
    instead of failing in max().
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
298
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  Splitting pays off when many of the ints
    are the same: t is cut into bins of 2**shift items, each distinct
    bin is stored once in t2, and t1 records, for every bin of t, where
    its contents start in t2 (divided by the bin size).  t1 and t2 are
    lists of ints and shift is an int, chosen to minimize the combined
    size of t1 and t2 (in C code).  For each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    import sys

    def report(t1, t2, shift, nbytes):
        # one line of trace output for a candidate split
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(t1), len(t2), shift, nbytes))

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t)*getsize(t)))

    # the most we can shift the last valid index and still have
    # something left
    top = len(t) - 1
    maxshift = 0
    while top > 1:
        top = top >> 1
        maxshift = maxshift + 1

    t = tuple(t)    # so slices can be dict keys
    best = None     # (t1, t2, shift) of the smallest split so far
    best_size = None
    for shift in range(maxshift + 1):
        width = 1 << shift
        seen = {}   # bin contents -> offset of that bin in t2
        t1 = []
        t2 = []
        for start in range(0, len(t), width):
            chunk = t[start:start+width]
            try:
                offset = seen[chunk]
            except KeyError:
                offset = seen[chunk] = len(t2)
                t2.extend(chunk)
            t1.append(offset >> shift)
        # determine memory size
        nbytes = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            report(t1, t2, shift, nbytes)
        if best is None or nbytes < best_size:
            best = t1, t2, shift
            best_size = nbytes

    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        report(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1     # low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
358    return best
359
# when run as a script (rather than imported), regenerate the tables
if __name__ == "__main__":
    maketables()
362