# makeunicodedata.py revision 2101348830ff0d65cebd4caf886011f45bcc7618
#
# makeunidb.py -- generate a compact version of the unicode property
# database (unicodedatabase.h)
#

import string  # not used below any more; kept so the module namespace is unchanged
import sys

SCRIPT = sys.argv[0]
VERSION = "1.0"

UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

# NOTE: "Cn" is listed twice (positions 0 and 17).  list.index() returns the
# first match, so position 17 is never produced by maketable(); the entry is
# left in place because removing it would renumber every later category.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

def maketable(unicode_data=None, outfile="unicodedata_db.h"):
    """Generate the C header holding the compact unicode property tables.

    unicode_data -- path of the unicode data file to read; when omitted
        the module-level UNICODE_DATA is used.
    outfile -- path of the header to write (default "unicodedata_db.h").

    The header contains the list of unique property records, the category
    and bidirectional name tables, and a two-level index (index1/index2
    plus SHIFT) mapping each character to its record.

    Raises ValueError if a record uses a category or bidirectional name
    that is missing from CATEGORY_NAMES / BIDIRECTIONAL_NAMES.
    """
    if unicode_data is None:
        unicode_data = UNICODE_DATA

    unidata = UnicodeData(unicode_data)

    # extract unicode properties; record 0 is shared by every character
    # that has no entry in the data file
    dummy = (0, 0, 0, 0, "NULL")
    table = [dummy]
    cache = {dummy: 0}  # record -> position in table
    index = [0] * len(unidata.chars)

    for char in unidata.chars:
        record = unidata.table[char]
        if not record:
            continue
        # extract database properties
        category = CATEGORY_NAMES.index(record[2])
        combining = int(record[3])
        bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
        mirrored = int(record[9] == "Y")
        if record[5]:
            decomposition = '"%s"' % record[5]
        else:
            decomposition = "NULL"
        item = (
            category, combining, bidirectional, mirrored, decomposition
            )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # FIXME: we really should compress the decomposition stuff
    # (see the unidb utilities for one way to do this)

    # split index table (done before the output file is opened, so that a
    # failure here does not leave a truncated header behind)
    index1, index2, shift = splitbins(index)

    # write straight to the file instead of temporarily replacing
    # sys.stdout, and make sure the file is closed whatever happens
    out = open(outfile, "w")
    try:
        write = out.write

        write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
        write("\n")
        write("/* a list of unique database records */\n")
        write("const _PyUnicode_DatabaseRecord "
              "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            write(" {%d, %d, %d, %d, %s},\n" % item)
        write("};\n")
        write("\n")

        write("/* string literals */\n")
        write("const char *_PyUnicode_CategoryNames[] = {\n")
        for name in CATEGORY_NAMES:
            write(" \"%s\",\n" % name)
        write(" NULL\n")
        write("};\n")

        write("const char *_PyUnicode_BidirectionalNames[] = {\n")
        for name in BIDIRECTIONAL_NAMES:
            write(" \"%s\",\n" % name)
        write(" NULL\n")
        write("};\n")

        write("/* index tables used to find the right database record */\n")
        write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(out)
        Array("index2", index2).dump(out)
    finally:
        out.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    """In-memory copy of a semicolon-separated unicode data file.

    Public attributes:
    filename -- the path the data was loaded from
    table -- list of 65536 entries indexed by character code; each entry
        is the list of fields of the corresponding line, or None when the
        file has no line for that code
    chars -- the character codes to process (all 65536 by default)
    """

    def __init__(self, filename):
        """Load filename.

        The first field of every line is the character code in hex.
        Blank lines are skipped.  Raises IndexError for a code of 65536
        or above, since the table only has 65536 slots.
        """
        table = [None] * 65536
        fp = open(filename)
        try:
            while 1:
                line = fp.readline()
                if not line:
                    break
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines
                fields = line.split(";")
                char = int(fields[0], 16)
                table[char] = fields
        finally:
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(65536)) # unicode

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# stuff to deal with arrays of unsigned integers

class Array:
    """A named sequence of unsigned integers that can be written as C."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file (any object with write()) as a static C
        array, using the smallest unsigned type that holds every item and
        wrapping the initializer lines at 78 columns."""
        size = getsize(self.data)
        # sys.stderr.write("%s: %d bytes\n" % (self.name, size*len(self.data)))
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = " "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = " " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    """Return the smallest integer size in bytes (1, 2 or 4) able to hold
    every item of data.  An empty sequence needs only the smallest size."""
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0).

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    if trace:
        def dump(t1, t2, shift, nbytes):
            sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
                len(t1), len(t2), shift, nbytes))
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t)*getsize(t)))
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    best = None     # (t1, t2, shift) of the smallest split so far
    smallest = 0    # its total size in bytes; only valid once best is set
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 1 << shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this chunk is seen: store it at the end of t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            dump(t1, t2, shift, b)
        # strict "<" keeps the smallest shift among equally sized splits
        if best is None or b < smallest:
            best = t1, t2, shift
            smallest = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        dump(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1 # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketable()