gencodec.py revision bd20ea55bc7a044a773e6824f7fcef4f5669d44c
1""" Unicode Mapping Parser and Codec Generator.
2
3This script parses Unicode mapping files as available from the Unicode
4site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5modules from them. The codecs use the standard character mapping codec
6to actually apply the mapping.
7
8Synopsis: gencodec.py dir codec_prefix
9
10All files in dir are scanned and those producing non-empty mappings
11will be written to <codec_prefix><mapname>.py with <mapname> being the
12first part of the map's filename ('a' in a.b.c.txt) converted to
13lowercase with hyphens replaced by underscores.
14
15The tool also writes marshalled versions of the mapping tables to the
16same location (with .mapping extension).
17
18Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21(c) Copyright Guido van Rossum, 2000.
22
23Table generation:
24(c) Copyright Marc-Andre Lemburg, 2005.
25    Licensed to PSF under a Contributor Agreement.
26
27"""#"
28
29import re, os, time, marshal, codecs
30
# Largest key a generated decoding table may contain; maps with a
# larger key are not turned into tables (see python_tabledef_code)
MAX_TABLE_SIZE = 8192

# Unicode code point U+FFFE, used to fill undefined decoding table
# entries.  Written as a literal, which is equivalent to
# unichr(0xFFFE) but does not depend on the unichr() builtin.
UNI_UNDEFINED = u'\uFFFE'
36
# Matches one mapping line.  The groups are:
#   1. the encoded code(s): one or more 0x... values joined by '+'
#   2. the Unicode code(s): zero or more 0x... values or <meta> codes
#      joined by '+' (may be empty)
#   3. an optional trailing comment starting with '#'
# Raw strings are used so that the regex escapes (\+, \s) are not
# interpreted as string escapes.
# NOTE(review): the character class in group 2 reads [0-9a-fA-Z], which
# looks like a typo for A-F.  It is left alone on purpose: a malformed
# code such as 0x20AG currently reaches int(..., 16) and fails loudly,
# whereas narrowing the class would silently match only the 0x20A part.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
42
def parsecodes(codes,
               len=len, filter=filter,range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x0D+0x0A'.  Components which are not valid hexadecimal
        numbers are ignored; this covers meta-codes in angular
        brackets (e.g. <LR> and <RL>) as well as the empty component
        left behind by a trailing '+'.

        Returns an integer if exactly one valid code remains, a tuple
        of integers if several remain and None if codes is empty or
        does not contain any valid code.

        The len, filter and range parameters merely bind the builtins
        as locals; they are kept for backward compatibility.

    """
    if not codes:
        return None
    values = []
    for code in codes.split('+'):
        try:
            values.append(int(code, 16))
        except ValueError:
            # Meta-code or empty component: ignore
            continue
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
70
def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map
        as dictionary.

        The dictionary maps encoded codes to (unicode_code, comment)
        tuples.  Codes are given in the form returned by
        parsecodes(); unicode_code is None for undefined entries.

        If at least as many codes below 256 are identity-mapped as
        are left unmapped, the unmapped ones are entered explicitly
        as (None, "") and the special key 'IDENTITY' is set to 256.

        Lines which are empty, start with '#' or don't match mapRE
        are skipped.

    """
    f = open(filename,'r')
    try:
        lines = f.readlines()
    finally:
        # Don't leak the file handle if reading fails
        f.close()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Not a mapping entry
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Strip the leading '#'
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity
        # bookkeeping; code combinations (tuples) never do
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni,comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
121
def hexrepr(t, precision=4):

    """ Returns the source code representation of the code t.

        None is rendered as 'None', a single integer as 0x-prefixed
        upper case hex number with at least precision digits and a
        sequence of integers as parenthesized, comma separated list
        of such numbers.

        If an item of a sequence cannot be formatted, a diagnostic
        line is printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code rather than a sequence
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError:
        # sys.exc_info() gives access to the exception without
        # version specific except clause syntax
        import sys
        print('* failed to convert %r: %s' % (t, sys.exc_info()[1]))
        raise
136
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns the Python source lines defining the dictionary
        varname from map, as list of strings.

        Keys and values of map are either codes or (code, comment)
        tuples; entries whose key is None are skipped.  precisions
        gives the minimum number of hex digits used for keys and
        values.  Entry comments are included if comments is true.

        If map has the special key "IDENTITY", the generated code
        starts out with codecs.make_identity_dict(range(N)), N being
        the value stored under that key, and entries below 256 which
        map to themselves are left out.  Note that the "IDENTITY" key
        is removed from map in place.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        # The marker is not a mapping entry; drop it before the
        # entries are listed
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        # Unpack (code, comment) tuples; a comment found on the value
        # takes precedence over one found on the key
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t#  %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    # Close the dict literal or the last .update({ call
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
194
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns the Python source lines defining the decoding table
        varname from map, as list of strings, or None if no table
        can be generated.

        Keys and values of map are either codes or (code, comment)
        tuples; entries whose key is None are skipped.  The generated
        lines are adjacent string literals inside parentheses, one
        per key from 0 up to the largest key found; keys without
        mapping or mapped to None are filled with UNI_UNDEFINED.

        None is returned if the largest key exceeds MAX_TABLE_SIZE
        or if a value is a tuple of codes (1-n mapping).

        If map has the special key 'IDENTITY', the table starts out
        with identity mappings for the keys 0 - 255.  Note that the
        'IDENTITY' key is removed from map in place.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        # Remove the marker *before* listing the entries: it is not a
        # code and would otherwise be picked up as largest key, making
        # the table look too large
        del map['IDENTITY']
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        # Unpack (code, comment) tuples; a comment found on the value
        # takes precedence over one found on the key
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append('    %r\t#  %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %r' % mapchar)

    append(')')
    return l
250
def codegen(name, map, comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.
        The decoding side is written as decoding table whenever
        python_tabledef_code() returns one and as decoding map
        otherwise; the encoding map is always written.

        Comments are included in the source, if comments is true (default).

    """
    # Generate code. The first call removes the 'IDENTITY' marker
    # from map, so the later calls work on the plain entries.
    decoding_map_code = python_mapdef_code(
        'decoding_map', map, comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table', map, comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map', codecs.make_encoding_map(map),
        comments=comments, precisions=(4, 2))

    # Select the decoding variant (with preference to the table)
    if decoding_table_code:
        decode_return = '''\
        return codecs.charmap_decode(input,errors,decoding_table)'''
        decoding_header = '''
### Decoding Table
'''
        decoding_code = decoding_table_code
    else:
        decode_return = '''\
        return codecs.charmap_decode(input,errors,decoding_map)'''
        decoding_header = '''
### Decoding Map
'''
        decoding_code = decoding_map_code

    # Assemble the module: codec classes first, then the data
    parts = [
        '''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):
''' % name,
        decode_return,
        '''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
''',
        decoding_header,
        ]
    parts.extend(decoding_code)

    # Add encoding map
    parts.append('''
### Encoding Map
''')
    parts.extend(encoding_map_code)

    # Final new-line
    parts.append('\n')

    return '\n'.join(parts)
335
def pymap(name,map,pyfile,comments=1):

    """ Writes the Python codec source generated by codegen() for
        name and map to the file pyfile.

        comments is passed on to codegen().

    """
    code = codegen(name,map,comments)
    f = open(pyfile,'w')
    try:
        f.write(code)
    finally:
        # Close the file even if writing fails
        f.close()
342
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        All values of map have to be 2-item sequences; they are
        stored as tuples.  name is not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    f = open(marshalfile,'wb')
    try:
        marshal.dump(d,f)
    finally:
        # Close the file even if marshalling fails
        f.close()
351
def convertdir(dir,prefix='',comments=1):

    """ Converts all mapping files found in the directory dir.

        For each regular file a codec module <prefix><mapname>.py and
        a marshalled mapping <prefix><mapname>.mapping are written,
        <mapname> being the part of the file name up to the first
        dot, converted to lowercase with hyphens replaced by
        underscores.

        A ValueError raised during a conversion is reported and then
        re-raised.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        # os.listdir() returns plain file names, so mapname can be
        # used directly to derive the codec name
        name = mapname.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            map = readmap(mappathname)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mappathname, map, prefix + codefile,comments)
                marshalmap(mappathname, map, prefix + marshalfile)
        except ValueError:
            # sys.exc_info() gives access to the exception without
            # version specific except clause syntax
            import sys
            print('* conversion failed: %s' % sys.exc_info()[1])
            raise
377            raise
378
def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates the codec modules from the marshalled mappings
        found in the directory dir.

        For each file <name>.mapping the codec module
        <prefix><name>.py is written.  A ValueError raised during a
        conversion is reported and processing continues with the
        next file.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        codefile = mapname[:-len('.mapping')] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            f = open(os.path.join(dir,mapname), 'rb')
            try:
                map = marshal.load(f)
            finally:
                # Don't leak the file handle
                f.close()
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, prefix + codefile,comments)
        except ValueError:
            # sys.exc_info() gives access to the exception without
            # version specific except clause syntax
            import sys
            print('* conversion failed: %s' % sys.exc_info()[1])
397
if __name__ == '__main__':

    import sys
    # The command line arguments are passed on positionally.  The
    # constant condition is a manual switch between converting the
    # mapping files and regenerating the modules from the marshalled
    # mappings.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
405