gencodec.py revision 3f767795f6784ca6bf797b055be67fce5bf2fa06
134a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum""" Unicode Mapping Parser and Codec Generator.
234a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
334a79115c5d5be53581f49ced5a5a17171cabb7dGuido van RossumThis script parses Unicode mapping files as available from the Unicode
4a866df806dd0ffd439bbba873ab9f3da7080e0a0Marc-André Lemburgsite (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5a866df806dd0ffd439bbba873ab9f3da7080e0a0Marc-André Lemburgmodules from them. The codecs use the standard character mapping codec
6a866df806dd0ffd439bbba873ab9f3da7080e0a0Marc-André Lemburgto actually apply the mapping.
734a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
834a79115c5d5be53581f49ced5a5a17171cabb7dGuido van RossumSynopsis: gencodec.py dir codec_prefix
934a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
1034a79115c5d5be53581f49ced5a5a17171cabb7dGuido van RossumAll files in dir are scanned and those producing non-empty mappings
1134a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossumwill be written to <codec_prefix><mapname>.py with <mapname> being the
1234a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossumfirst part of the map's filename ('a' in a.b.c.txt) converted to
1334a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossumlowercase with hyphens replaced by underscores.
1434a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
15bae57a88a6e34fa25ba0a5812171a407dc4fc4b1Fred DrakeThe tool also writes marshalled versions of the mapping tables to the
1634a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossumsame location (with .mapping extension).
1734a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
18bd20ea55bc7a044a773e6824f7fcef4f5669d44cMarc-André LemburgWritten by Marc-Andre Lemburg (mal@lemburg.com).
1934a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
2034a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21a866df806dd0ffd439bbba873ab9f3da7080e0a0Marc-André Lemburg(c) Copyright Guido van Rossum, 2000.
22bd20ea55bc7a044a773e6824f7fcef4f5669d44cMarc-André Lemburg
23bd20ea55bc7a044a773e6824f7fcef4f5669d44cMarc-André LemburgTable generation:
24c5694c8bf4bf2008b42e0107fb245415df4147fdMarc-André Lemburg(c) Copyright Marc-Andre Lemburg, 2005.
25bd20ea55bc7a044a773e6824f7fcef4f5669d44cMarc-André Lemburg    Licensed to PSF under a Contributor Agreement.
2634a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
2734a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum"""#"
2834a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
29c5694c8bf4bf2008b42e0107fb245415df4147fdMarc-André Lemburgimport re, os, time, marshal, codecs
3034a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point (same value as unichr(0xFFFE))
UNI_UNDEFINED = u'\uFFFE'

# Matches one mapping line of a Unicode mapping file:
#   group 1: encoded code(s), one or more '+'-joined 0x... hex numbers
#   group 2: Unicode code(s), zero or more '+'-joined 0x... hex numbers
#            or <meta> codes (may be empty for undefined entries)
#   group 3: optional trailing '#' comment (including the '#')
# Raw strings keep the regex escapes (\+, \s) away from the string parser.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
4234a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
def parsecodes(codes,
               len=len, filter=filter,range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of '+'-separated items, each either a
        hexadecimal number (e.g. 0x00C4) or a meta-code.

        meta-codes (in angle brackets, e.g. <LR> and <RL>) are
        ignored.

        Empty codes or illegal ones are returned as None.  This
        includes the case where nothing but meta-codes is given.

        The len, filter and range parameters are only kept for
        backward compatibility of the signature; filter and range
        are no longer used.

    """
    if not codes:
        return None
    values = []
    for item in codes.split('+'):
        try:
            values.append(int(item, 16))
        except ValueError:
            # Meta-code, empty item (trailing '+') or otherwise
            # illegal code: skip it
            pass
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
7034a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
def readmap(filename):

    """ Reads the Unicode mapping file filename and returns the
        decoding map found in it.

        The result is a dictionary mapping encoded codes (as returned
        by parsecodes()) to (unicode_code, comment) tuples.  Code
        points 0x00 - 0x1F and 0x7F are pre-seeded as identity
        mappings; lines in the file may override them.

        If there are at least as many identity-mapped entries as
        unmapped ones among the first 256 codes, the unmapped codes
        are explicitly mapped to (None, "") and the special entry
        'IDENTITY': 256 is added to the dictionary.

    """
    f = open(filename,'r')
    try:
        lines = f.readlines()
    finally:
        # Close the file even if reading fails
        f.close()
    enc2uni = {}
    identity = []
    # Needs to be a real list, since entries are removed below
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print '* not matched: %s' % repr(line)
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Strip the leading '#' and surrounding whitespace
            comment = comment[1:].strip()
        # Multi-code (tuple) keys never take part in the single byte
        # bookkeeping; test for them explicitly instead of relying on
        # the ordering of tuples and integers.
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni,comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
12134a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
def hexrepr(t, precision=4):

    """ Returns a Python source representation of t using hex
        literals.

        None is rendered as 'None', a single integer as 0x...
        zero-padded to precision hex digits and a sequence of
        integers as a parenthesized, comma separated list of such
        literals.

        A sequence containing items which cannot be formatted as
        hex numbers causes a diagnostic to be printed and the
        TypeError to be re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except (TypeError, AttributeError):
        # No length: treat t as a single code
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
13634a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of Python source lines defining the dictionary
        varname from the entries in map.

        Keys and values of map may be plain codes or (code, comment)
        tuples; in the latter case the comment is appended to the
        generated line, if comments is true.  Entries with a key of
        None are skipped.  precisions gives the number of hex digits
        used for keys and values.

        If map contains the special 'IDENTITY' entry, the generated
        code starts from codecs.make_identity_dict() and identity
        mappings below 256 are left out.  Note that the 'IDENTITY'
        entry is removed from map as a side effect.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        # The open block is an .update({ call, so it has to be
        # closed with '})' rather than '}'
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t#  %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
194c5694c8bf4bf2008b42e0107fb245415df4147fdMarc-André Lemburg
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of Python source lines defining the decoding
        table varname from the entries in map, or None in case no
        table can be generated.

        The generated code consists of one repr()ed character per
        line, enclosed in parentheses.  Codes without a mapping are
        represented by UNI_UNDEFINED.  If comments is true, each line
        having a comment is annotated with the key (using
        key_precision hex digits) and that comment.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if a code maps to more than one code point (1-n mapping).

        If map contains the special 'IDENTITY' entry, the first 256
        entries default to identity mappings.  Note that the
        'IDENTITY' entry is removed from map as a side effect.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # Take the snapshot only after the 'IDENTITY' marker has been
    # removed; otherwise the marker would be processed like a regular
    # mapping entry below.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append('    %r\t#  %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %r' % mapchar)

    append(')')
    return l
250c5694c8bf4bf2008b42e0107fb245415df4147fdMarc-André Lemburg
def codegen(name, map, encodingname, comments=1):

    """ Builds the Python source text of a codec module for map.

        name is inserted into the module docstring as the origin of
        the mapping, encodingname is used in the docstring and (with
        underscores replaced by hyphens) as the registered codec
        name.

        The module uses a decoding table together with
        codecs.charmap_build() if a table could be generated for map,
        and explicit decoding/encoding dictionaries otherwise.

        Comments are emitted along with the mapping data, if comments
        is true (default).

    """
    # Render all candidate data sections first.  The order of the
    # three calls is significant, since the helper functions remove
    # the 'IDENTITY' marker from map.
    decoding_map_code = python_mapdef_code(
        'decoding_map', map, comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table', map, comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map', codecs.make_encoding_map(map),
        comments=comments, precisions=(4, 2))

    # Prefer the table based variant whenever a table is available
    use_table = bool(decoding_table_code)
    if use_table:
        suffix = 'table'
    else:
        suffix = 'map'

    # Module header and stateless codec
    header = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)

    # Incremental codec classes
    incremental = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' % (
        suffix, suffix)

    # Stream classes and registry entry
    registry = '''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-')

    parts = [header, incremental, registry]

    # Add the mapping data: decoding side first, then encoding side
    if use_table:
        parts.append('''
### Decoding Table
''')
        parts.extend(decoding_table_code)
        parts.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        parts.append('''
### Decoding Map
''')
        parts.extend(decoding_map_code)
        parts.append('''
### Encoding Map
''')
        parts.extend(encoding_map_code)

    # Final new-line
    parts.append('')

    return '\n'.join(parts).expandtabs()
35534a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module source generated by codegen() for
        map to the file pyfile.

        name, map, encodingname and comments are passed on to
        codegen() unchanged.

    """
    code = codegen(name,map,encodingname,comments)
    f = open(pyfile,'w')
    try:
        f.write(code)
    finally:
        # Don't leak the file handle if writing fails
        f.close()
36234a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        All values of map have to be 2-tuples (code, comment);
        anything else causes an exception before the file is
        opened.  name is not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    f = open(marshalfile,'wb')
    try:
        marshal.dump(d,f)
    finally:
        # Don't leak the file handle if dumping fails
        f.close()
37134a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
372abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwalddef convertdir(dir, dirprefix='', nameprefix='', comments=1):
37334a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
37434a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum    mapnames = os.listdir(dir)
37534a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum    for mapname in mapnames:
376c5694c8bf4bf2008b42e0107fb245415df4147fdMarc-André Lemburg        mappathname = os.path.join(dir, mapname)
377bd20ea55bc7a044a773e6824f7fcef4f5669d44cMarc-André Lemburg        if not os.path.isfile(mappathname):
378bd20ea55bc7a044a773e6824f7fcef4f5669d44cMarc-André Lemburg            continue
37934a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum        name = os.path.split(mapname)[1]
380aaab30e00cc3e8d90c71b8657c284feeb4ac1413Walter Dörwald        name = name.replace('-','_')
381aaab30e00cc3e8d90c71b8657c284feeb4ac1413Walter Dörwald        name = name.split('.')[0]
382aaab30e00cc3e8d90c71b8657c284feeb4ac1413Walter Dörwald        name = name.lower()
383abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwald        name = nameprefix + name
38434a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum        codefile = name + '.py'
38534a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum        marshalfile = name + '.mapping'
38634a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum        print 'converting %s to %s and %s' % (mapname,
387abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwald                                              dirprefix + codefile,
388abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwald                                              dirprefix + marshalfile)
38934a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum        try:
39034a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum            map = readmap(os.path.join(dir,mapname))
39134a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum            if not map:
39234a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum                print '* map is empty; skipping'
39334a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum            else:
394abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwald                pymap(mappathname, map, dirprefix + codefile,name,comments)
395abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwald                marshalmap(mappathname, map, dirprefix + marshalfile)
396c5694c8bf4bf2008b42e0107fb245415df4147fdMarc-André Lemburg        except ValueError, why:
397c5694c8bf4bf2008b42e0107fb245415df4147fdMarc-André Lemburg            print '* conversion failed: %s' % why
398c5694c8bf4bf2008b42e0107fb245415df4147fdMarc-André Lemburg            raise
39934a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
400abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwalddef rewritepythondir(dir, dirprefix='', comments=1):
40170c4378dbcfdcbeef6fb3aa348f32ed862fe8eb7Tim Peters
40234a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum    mapnames = os.listdir(dir)
40334a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum    for mapname in mapnames:
404a866df806dd0ffd439bbba873ab9f3da7080e0a0Marc-André Lemburg        if not mapname.endswith('.mapping'):
40534a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum            continue
406abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwald        name = mapname[:-len('.mapping')]
407abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwald        codefile = name + '.py'
40834a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum        print 'converting %s to %s' % (mapname,
409abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwald                                       dirprefix + codefile)
41034a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum        try:
41134a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum            map = marshal.load(open(os.path.join(dir,mapname),
41234a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum                               'rb'))
41334a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum            if not map:
41434a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum                print '* map is empty; skipping'
41534a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum            else:
416abb02e59946f9ea3076e96e3b03b51d1cebd46b4Walter Dörwald                pymap(mapname, map, dirprefix + codefile,name,comments)
41734a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum        except ValueError, why:
41834a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum            print '* conversion failed: %s' % why
41934a79115c5d5be53581f49ced5a5a17171cabb7dGuido van Rossum
if __name__ == '__main__':

    import sys
    # The command line arguments are passed on positionally.  The
    # constant below is a manual switch: change it to 0 to regenerate
    # the codec modules from existing .mapping files instead of
    # converting a directory of mapping files.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
427