gencodec.py revision 3f767795f6784ca6bf797b055be67fce5bf2fa06
1""" Unicode Mapping Parser and Codec Generator.
2
3This script parses Unicode mapping files as available from the Unicode
4site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5modules from them. The codecs use the standard character mapping codec
6to actually apply the mapping.
7
8Synopsis: gencodec.py dir codec_prefix
9
10All files in dir are scanned and those producing non-empty mappings
11will be written to <codec_prefix><mapname>.py with <mapname> being the
12first part of the map's filename ('a' in a.b.c.txt) converted to
13lowercase with hyphens replaced by underscores.
14
15The tool also writes marshalled versions of the mapping tables to the
16same location (with .mapping extension).
17
18Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21(c) Copyright Guido van Rossum, 2000.
22
23Table generation:
24(c) Copyright Marc-Andre Lemburg, 2005.
25    Licensed to PSF under a Contributor Agreement.
26
27"""#"
28
29import re, os, time, marshal, codecs
30
# Maximum allowed size of charmap tables; maps whose largest key
# exceeds this value are not emitted in table form
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point (same value as unichr(0xFFFE))
UNI_UNDEFINED = u'\ufffe'

# Matches one line of a Unicode mapping file:
#   group 1: encoded code(s), hex numbers optionally joined by '+'
#   group 2: Unicode code(s) and/or meta-codes such as <LR>, may be empty
#   group 3: trailing '#' comment, if any
# Raw strings keep the backslash escapes intact for the regex engine.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
42
def parsecodes(codes,
               len=len, filter=filter,range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal numbers joined by '+'
        (e.g. '0x0E01+0x0E02').

        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
        ignored.

        Empty codes or illegal ones are returned as None. This also
        holds if nothing but meta-codes or illegal codes are given.

        The len, filter and range parameters are only kept for
        backwards compatibility of the signature.

    """
    if not codes:
        return None
    parsed = []
    for part in codes.split('+'):
        try:
            parsed.append(int(part, 16))
        except ValueError:
            # Meta-code, empty part (trailing '+') or otherwise
            # illegal code: ignore it
            continue
    if not parsed:
        return None
    if len(parsed) == 1:
        return parsed[0]
    return tuple(parsed)
70
def readmap(filename):

    """ Reads the Unicode mapping file filename and returns the
        decoding map as dictionary.

        Keys are the encoded codes as returned by parsecodes(), values
        are tuples (unicode, comment) with unicode being the parsed
        Unicode code(s) or None for undefined mappings.

        Code points 0x00 - 0x1F and 0x7F are preset to identity
        mappings; lines in the file override these.

        If at least as many identity entries as unmapped entries were
        recorded for the first 256 code points, the unmapped ones are
        added as (None, "") and the special key 'IDENTITY' is set
        to 256.

    """
    f = open(filename,'r')
    try:
        lines = f.readlines()
    finally:
        # Release the file handle even if reading fails
        f.close()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print '* not matched: %s' % repr(line)
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Strip the leading '#'
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity /
        # unmapped bookkeeping; code combinations (tuples) never do
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni,comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
121
def hexrepr(t, precision=4):

    """ Returns the hexadecimal source representation of t.

        None is returned as 'None', a single integer as '0x...' with
        at least precision digits and a sequence of integers as
        parenthesized, comma separated list of such numbers.

        If an item of a sequence cannot be formatted, a message is
        printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
136
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source code lines which define varname as
        dictionary holding the entries of map.

        Keys and values of map may be tuples (code, comment); the
        comment is then added to the entry's line if comments is true.
        precisions gives the minimum number of hex digits used for
        keys and values.

        If map has the special key 'IDENTITY', the definition starts
        with an identity dictionary which is then updated; identity
        entries below 256 are left out in that case. Note that the
        'IDENTITY' entry is deleted from map as a side effect.

    """
    lines = []
    emit = lines.append
    identity = "IDENTITY" in map
    if identity:
        emit("%s = codecs.make_identity_dict(range(%d))" %
             (varname, map["IDENTITY"]))
        emit("%s.update({" % varname)
        del map["IDENTITY"]
        closer = '})'
    else:
        emit("%s = {" % varname)
        closer = '}'

    key_precision, value_precision = precisions
    count = 0
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            mapkey, mapcomment = mapkey
        if isinstance(mapvalue, tuple):
            mapvalue, mapcomment = mapvalue
        if mapkey is None:
            continue
        if identity and mapkey == mapvalue and mapkey < 256:
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        entry = '    %s: %s,' % (hexrepr(mapkey, key_precision),
                                 hexrepr(mapvalue, value_precision))
        if mapcomment and comments:
            entry = '%s\t#  %s' % (entry, mapcomment)
        emit(entry)
        count += 1
        if count == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            emit(closer)
            emit('%s.update({' % varname)
            # All following parts are update() calls
            closer = '})'
            count = 0
    emit(closer)

    return lines
194
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source code lines which define varname as
        decoding table for map, or None if no table can be built.

        The table has one entry per key from 0 up to the largest key
        in map; keys without mapping get UNI_UNDEFINED. The emitted
        character literals carry no separating commas, so the generated
        parenthesized expression is one concatenated string.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if an entry maps to more than one code point.

        If map has the special key 'IDENTITY', the first 256 entries
        are preset to identity mappings. Note that the 'IDENTITY'
        entry is deleted from map as a side effect.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # Fetch the items only after the IDENTITY marker has been removed;
    # otherwise the marker would be processed like a code point and
    # end up as maxkey.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append('    %r\t#  %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %r' % mapchar)

    append(')')
    return l
250
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is mentioned in the docstring of the generated module as
        the origin of the mapping, encodingname is used as codec name
        (with underscores replaced by hyphens in the registry entry).

        The decoding table is used if it could be generated, otherwise
        the module falls back to decoding and encoding dictionaries.

        Comments are included in the source, if comments is true (default).

    """
    # Generate code; the decoding map is generated first, since this
    # also removes a possible 'IDENTITY' entry from map
    decoding_map_code = python_mapdef_code(
        'decoding_map', map, comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table', map, comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    suffix = 'table' if decoding_table_code else 'map'

    # Module header and stateless codec
    source = ['''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)]

    # Incremental codecs
    source.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
        (suffix, suffix))

    # Stream codecs and registry entry
    source.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    if decoding_table_code:
        # Decoding table plus the encoding table derived from it
        source.append('''
### Decoding Table
''')
        source.extend(decoding_table_code)
        source.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        # Fall back to the dictionary based mappings
        source.append('''
### Decoding Map
''')
        source.extend(decoding_map_code)
        source.append('''
### Encoding Map
''')
        source.extend(encoding_map_code)

    # Final new-line
    source.append('')

    return '\n'.join(source).expandtabs()
355
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Generates the codec source for map using codegen() and writes
        it to the file pyfile.

    """
    code = codegen(name,map,encodingname,comments)
    f = open(pyfile,'w')
    try:
        f.write(code)
    finally:
        # Close the file even if writing fails
        f.close()
362
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        All values of map have to be pairs (unicode, comment); they
        are stored as tuples. name is not used.

    """
    d = dict((e, (u, c)) for e, (u, c) in map.items())
    f = open(marshalfile,'wb')
    try:
        marshal.dump(d,f)
    finally:
        # Close the file even if dumping fails
        f.close()
371
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For each regular file a codec module (.py) and a marshalled
        mapping (.mapping) are written. The output name is nameprefix
        plus the first dot separated part of the file name, in lower
        case and with hyphens replaced by underscores; dirprefix is
        prepended to the output file names.

        A ValueError during conversion is reported and re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        basename = os.path.split(mapname)[1]
        name = nameprefix + basename.replace('-','_').split('.')[0].lower()
        codefile = dirprefix + name + '.py'
        marshalfile = dirprefix + name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              codefile,
                                              marshalfile))
        try:
            mapping = readmap(os.path.join(dir,mapname))
            if mapping:
                pymap(mappathname, mapping, codefile, name, comments)
                marshalmap(mappathname, mapping, marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
399
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the marshalled mappings
        (files with .mapping extension) found in the directory dir.

        The module for <name>.mapping is written to
        dirprefix + <name>.py. A ValueError during conversion is
        reported, but does not stop the processing.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # NOTE: marshal.load() is not safe against erroneous or
            # maliciously constructed data; only use this on .mapping
            # files from a trusted source.
            f = open(os.path.join(dir,mapname), 'rb')
            try:
                map = marshal.load(f)
            finally:
                # Don't leak the file handle
                f.close()
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
419
if __name__ == '__main__':

    import sys
    # The command line arguments are passed on as positional
    # arguments. Change the condition to run rewritepythondir()
    # instead of convertdir().
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
427