# gencodec.py revision 3f767795f6784ca6bf797b055be67fce5bf2fa06
""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
    Licensed to PSF under a Contributor Agreement.

"""#"

import codecs
import marshal
import os
import re
import sys
import time

# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Compatibility aliases so the module runs on both Python 2 and Python 3.
try:
    _unichr = unichr
except NameError:
    _unichr = chr
try:
    _integer_types = (int, long)
except NameError:
    _integer_types = (int,)
try:
    _string_types = (basestring,)
except NameError:
    _string_types = (str,)
try:
    # ascii() keeps generated sources pure ASCII on Python 3; on Python 2
    # repr() of a unicode object already does.
    _ascii_repr = ascii
except NameError:
    _ascii_repr = repr

# Standard undefined Unicode code point
UNI_UNDEFINED = _unichr(0xFFFE)

# Marker key readmap() stores in a map to request an identity dictionary
# as the starting point of the generated decoding map.
_IDENTITY = 'IDENTITY'

# Groups: (1) encoded code(s), (2) Unicode code(s) and/or <meta> codes,
# (3) trailing comment including the leading '#'.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')


def _is_commented(item):
    """ Return true if item is a (code, comment) pair.

        A pair is recognised by its string second element, which
        distinguishes it from a tuple of integer codes.

    """
    return (isinstance(item, tuple) and
            len(item) == 2 and
            isinstance(item[1], _string_types))


def _split_entry(mapkey, mapvalue):
    """ Strip the comment from a map entry.

        Returns (key, value, comment).  If both the key and the value
        carry a comment, the value's comment wins.

    """
    mapcomment = ''
    if _is_commented(mapkey):
        mapkey, mapcomment = mapkey
    if _is_commented(mapvalue):
        mapvalue, mapcomment = mapvalue
    return mapkey, mapvalue, mapcomment


def _mixed_sort_key(value):
    """ Return a sort key ordering None < integers < strings < tuples.

        Python 3 refuses to compare these types with each other; this
        key mirrors the order Python 2 used for them, so the generated
        sources list their entries in the same order on both versions.

    """
    if value is None:
        return (0,)
    if isinstance(value, _integer_types):
        return (1, value)
    if isinstance(value, tuple):
        return (3, tuple([_mixed_sort_key(item) for item in value]))
    return (2, value)


def parsecodes(codes,
               len=len, filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+',
        e.g. '0x41' or '0x0041+0x0301'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
        ignored.

        Empty codes or illegal ones are returned as None.

        The len, filter and range parameters are kept only for
        backward compatibility of the signature.

    """
    if not codes:
        return None
    values = []
    for code in codes.split('+'):
        try:
            values.append(int(code, 16))
        except ValueError:
            # Meta-code or empty/illegal fragment: ignore it
            continue
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)


def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map.

        The result maps each encoded code (an integer, or a tuple of
        integers for '+'-joined codes) to a (unicode_code, comment)
        tuple; unicode_code is None for undefined mappings.

        If identity mappings are at least as numerous as unmapped codes
        below 256, the unmapped codes are added as explicit (None, '')
        entries and the key 'IDENTITY' is set to 256.

    """
    f = open(filename, 'r')
    try:
        lines = f.readlines()
    finally:
        f.close()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print('* not matched: %s' % repr(line))
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity
        # bookkeeping; tuples (multi-code keys) are just recorded.
        if isinstance(enc, _integer_types) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni[_IDENTITY] = 256

    return enc2uni


def hexrepr(t, precision=4):

    """ Returns the source representation of a code or tuple of codes.

        None gives 'None', an integer gives '0x...' zero-padded to
        precision hex digits, and a sequence gives a parenthesized,
        comma separated list of such literals.

        A TypeError raised while formatting a sequence is reported on
        stdout and re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # Not a sequence: format as a single code
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError:
        # sys.exc_info() is valid syntax on every Python version
        why = sys.exc_info()[1]
        print('* failed to convert %r: %s' % (t, why))
        raise


def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns the source lines defining the dictionary varname.

        map may use plain codes or (code, comment) tuples as keys and
        values.  If it contains the 'IDENTITY' marker, the definition
        starts from codecs.make_identity_dict() and identity mappings
        below 256 are left out.  The map itself is not modified.

        precisions gives the hex digit counts for (keys, values).

    """
    l = []
    append = l.append
    if _IDENTITY in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map[_IDENTITY]))
        append("%s.update({" % varname)
        splits = 1
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    # Leave the marker out without deleting it from the caller's map
    mappings = [item for item in map.items() if item[0] != _IDENTITY]
    mappings.sort(key=lambda item: _mixed_sort_key(item[0]))
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapkey, mapvalue, mapcomment = _split_entry(mapkey, mapvalue)
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            isinstance(mapkey, _integer_types) and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l


def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns the source lines defining the decoding table varname.

        The table is a parenthesized sequence of adjacent one-character
        string literals, one per code from 0 up to the largest key.

        Returns None if the map cannot be expressed as a table: a key
        is not an integer, a key exceeds MAX_TABLE_SIZE, or a value is
        a 1-n mapping.  The map itself is not modified.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if _IDENTITY in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
    for mapkey, mapvalue in map.items():
        if mapkey == _IDENTITY:
            continue
        mapkey, mapvalue, mapcomment = _split_entry(mapkey, mapvalue)
        if mapkey is None:
            continue
        if not isinstance(mapkey, _integer_types):
            # Multi-code keys cannot index a table
            return None
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = _unichr(mapvalue)
        if mapcomment and comments:
            append('    %s\t# %s -> %s' % (_ascii_repr(mapchar),
                                           hexrepr(key, key_precision),
                                           mapcomment))
        else:
            append('    %s' % _ascii_repr(mapchar))

    append(')')
    return l


def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is the mapping's source name quoted in the generated
        module docstring; encodingname is the codec name.

        A decoding table is emitted when the map can be expressed as
        one; otherwise decoding and encoding dictionaries are emitted.

        Comments are included in the source, if comments is true (default).

    """
    # Generate code
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    # The identity marker is not a mapping and must not be inverted
    plain_map = dict([item for item in map.items()
                      if item[0] != _IDENTITY])
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(plain_map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()


def pymap(name, map, pyfile, encodingname, comments=1):

    """ Writes the codec source generated for map to the file pyfile.

    """
    code = codegen(name, map, encodingname, comments)
    f = open(pyfile, 'w')
    try:
        f.write(code)
    finally:
        f.close()


def marshalmap(name, map, marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Only the code -> (unicode, comment) entries are stored; the
        'IDENTITY' marker is left out.  name is not used.

    """
    d = {}
    for e, entry in map.items():
        if e == _IDENTITY:
            continue
        (u, c) = entry
        d[e] = (u, c)
    f = open(marshalfile, 'wb')
    try:
        marshal.dump(d, f)
    finally:
        f.close()


def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For each regular file, a codec module <name>.py and a
        marshalled map <name>.mapping are written, with dirprefix
        prepended to both output paths.  <name> is nameprefix plus the
        part of the file name before the first dot, lowercased and with
        hyphens replaced by underscores.

        A ValueError raised during a conversion is reported on stdout
        and re-raised.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-', '_')
        name = name.split('.')[0]
        name = name.lower()
        name = nameprefix + name
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mappathname, map, dirprefix + codefile, name, comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
        except ValueError:
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
            raise


def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the .mapping files in dir.

        Each <name>.mapping file is unmarshalled and written as
        <name>.py with dirprefix prepended to the output path.  A
        ValueError raised during a conversion is reported on stdout and
        processing continues with the next file.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # NOTE: marshal data must come from a trusted source, i.e.
            # files written by marshalmap().
            f = open(os.path.join(dir, mapname), 'rb')
            try:
                map = marshal.load(f)
            finally:
                f.close()
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile, name, comments)
        except ValueError:
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)


if __name__ == '__main__':

    import sys
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])