makeunicodedata.py revision 9c6850510c814dea4c4f5a5c7ff63c5e8ad3976b
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#

import sys

SCRIPT = sys.argv[0]
VERSION = "1.1"

UNICODE_DATA = "UnicodeData-Latest.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80

def maketables():
    """Regenerate both C header files from the UNICODE_DATA file.

    Reads UNICODE_DATA from the current directory, writes
    Modules/unicodedata_db.h (database records, name tables and
    decomposition data) and Objects/unicodetype_db.h (character type
    records), and prints the number of type records to stdout.
    """

    unicode = UnicodeData(UNICODE_DATA)

    # 1) database properties
    table, index = _database_records(unicode)

    # 2) decomposition data
    decomp_data, decomp_index = _decomposition_records(unicode)

    _write_database_header(
        "Modules/unicodedata_db.h", table, index, decomp_data, decomp_index
        )

    # 3) unicode type data
    table, index = _type_records(unicode)

    sys.stdout.write("%d ctype entries\n" % len(table))

    _write_type_header("Objects/unicodetype_db.h", table, index)

def _intern(item, table, cache):
    # return the position of item in table, appending it first if it
    # has not been seen before; cache maps item -> position
    i = cache.get(item)
    if i is None:
        cache[item] = i = len(table)
        table.append(item)
    return i

def _database_records(unicode):
    """Return (table, index) for the unicodedata property records.

    table is a list of unique (category, combining, bidirectional,
    mirrored) tuples; index[char] is the position in table of the
    record for char.  Characters without a database entry map to
    entry 0, the all-zero record.
    """
    dummy = (0, 0, 0, 0)
    table = [dummy]
    # the cache maps record -> position, so the dummy record has to be
    # registered under its own value; otherwise the first all-zero
    # record would be appended a second time
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            item = (
                CATEGORY_NAMES.index(record[2]),
                int(record[3]),
                BIDIRECTIONAL_NAMES.index(record[4]),
                record[9] == "Y",
                )
            index[char] = _intern(item, table, cache)

    return table, index

def _decomposition_records(unicode):
    """Return (decomp_data, decomp_index) for the decomposition table.

    decomp_data is a list of unique decomposition strings, with the
    empty string at position 0; decomp_index[char] is the position of
    the string for char (0 if the character has no decomposition).
    """

    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...

    decomp_data = [""]
    cache = {"": 0} # dictionary lookup instead of a linear list search
    decomp_index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record and record[5]:
            decomp_index[char] = _intern(record[5], decomp_data, cache)

    return decomp_data, decomp_index

def _case_delta(field, char):
    # delta predictor for upper/lower/title: distance from char to the
    # mapped character, reduced to 16 bits; 0 if there is no mapping
    if field:
        return (int(field, 16) - char) & 0xffff
    return 0

def _type_records(unicode):
    """Return (table, index) for the character type records.

    table is a list of unique (flags, upper, lower, title, decimal,
    digit) tuples; index[char] is the position in table of the record
    for char.  Characters without a database entry map to entry 0,
    the all-zero record.
    """
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0} # record -> position, see _database_records
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            upper = _case_delta(record[12], char)
            lower = _case_delta(record[13], char)
            title = _case_delta(record[14], char)
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            index[char] = _intern(item, table, cache)

    return table, index

def _write_names(fp, declaration, names):
    # write a NULL-terminated C array of string literals
    fp.write(declaration + "\n")
    for name in names:
        fp.write(" \"%s\",\n" % name)
    fp.write(" NULL\n")
    fp.write("};\n")

def _write_database_header(filename, table, index, decomp_data,
                           decomp_index):
    # write the unicodedata header: records, name tables, decomposition
    # strings and the two split index tables
    fp = open(filename, "w")
    try:
        fp.write("/* this file was generated by %s %s */\n" % (
            SCRIPT, VERSION))
        fp.write("\n")
        fp.write("/* a list of unique database records */\n")
        fp.write(
            "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[]"
            " = {\n")
        for item in table:
            fp.write(" {%d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        # FIXME: the following tables should be made static, and
        # the support code moved into unicodedatabase.c

        fp.write("/* string literals */\n")
        _write_names(
            fp, "const char *_PyUnicode_CategoryNames[] = {",
            CATEGORY_NAMES)
        _write_names(
            fp, "const char *_PyUnicode_BidirectionalNames[] = {",
            BIDIRECTIONAL_NAMES)
        _write_names(
            fp, "static const char *decomp_data[] = {", decomp_data)

        # split record index table
        index1, index2, shift = splitbins(index)

        fp.write("/* index tables for the database records */\n")
        fp.write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index)

        fp.write("/* index tables for the decomposition data */\n")
        fp.write("#define DECOMP_SHIFT %d\n" % shift)
        Array("decomp_index1", index1).dump(fp)
        Array("decomp_index2", index2).dump(fp)
    finally:
        # close explicitly so the header is flushed even on failure
        fp.close()

def _write_type_header(filename, table, index):
    # write the unicodetype header: type records and split index tables
    fp = open(filename, "w")
    try:
        fp.write("/* this file was generated by %s %s */\n" % (
            SCRIPT, VERSION))
        fp.write("\n")
        fp.write("/* a list of unique character type descriptors */\n")
        fp.write("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {\n")
        for item in table:
            fp.write(" {%d, %d, %d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        # split type index table
        index1, index2, shift = splitbins(index)

        fp.write("/* type indexes */\n")
        fp.write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)
    finally:
        fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import string, sys

class UnicodeData:
    """In-memory copy of a semicolon-separated unicode data file.

    Public attributes:
    filename -- the name of the file that was loaded
    table -- list of 65536 entries; table[char] is the list of fields
        for that character, or None if the file has no line for it
    chars -- the character codes to process (0..65535 by default)
    """

    def __init__(self, filename):
        table = [None] * 65536
        fp = open(filename)
        try:
            for line in fp.readlines():
                line = line.strip()
                if not line:
                    # a blank line carries no record (and has no
                    # character code to convert)
                    continue
                fields = line.split(";")
                # the first field is the character code, in hex
                table[int(fields[0], 16)] = fields
        finally:
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(65536)) # unicode

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# stuff to deal with arrays of unsigned integers

class Array:
    """A named sequence of unsigned integers that can be dumped as C."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file as a static C array definition.

        The element type is the narrowest unsigned C type that
        getsize() reports for the data.  Items are packed onto output
        lines, starting a new line when the next item would push the
        line past 78 characters.
        """
        size = getsize(self.data)
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            s = " "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = " " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    """Return the smallest integer size (1, 2 or 4 bytes) for data.

    The size is chosen from the largest item.  An empty sequence has
    no items to constrain the choice, so it gets the smallest size.
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def _trace_bins(t1, t2, shift, size):
    # report one candidate split on stderr (splitbins tracing)
    sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
        len(t1), len(t2), shift, size))

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0).

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n" % (
            len(t)*getsize(t)))
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    best = None
    best_size = None # smallest total size so far
    t = tuple(t) # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 1 << shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this run of values is seen: store it
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            _trace_bins(t1, t2, shift, b)
        if best_size is None or b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        _trace_bins(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables()