19cf41d0c9d48e6f7df971ebc596aa4cf269b70bbBenjamin Peterson#!/usr/bin/env python
""" Utility for parsing HTML entity definitions available from
    http://www.w3.org/, e.g.

      http://www.w3.org/TR/REC-html40/HTMLlat1.ent

    Input is read from the file named by the first command-line
    argument (default: stdin); output is written to the file named by
    the second argument (default: stdout) in the form of a Python
    snippet defining a dictionary "entitydefs" mapping each literal
    entity name to a character or numeric entity.

    Marc-Andre Lemburg, mal@lemburg.com, 1999.
    Use as you like. NO WARRANTIES.

"""
15a8b37ad9fff331b0ba32b27dc83c7af43f6c3fa6Guido van Rossumimport re,sys
16a8b37ad9fff331b0ba32b27dc83c7af43f6c3fa6Guido van Rossumimport TextTools
17a8b37ad9fff331b0ba32b27dc83c7af43f6c3fa6Guido van Rossum
# Matches one SGML entity declaration of the form
#     <!ENTITY name CDATA "value" -- comment -->
# capturing (name, value, comment).  The comment may span several lines;
# spaces directly before the closing "-->" are not part of the capture.
# A raw string is used so that "\w" reaches the regex engine untouched.
entityRE = re.compile(
    r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text,pos=0,endpos=None):
    """Collect all entity declarations found in *text*.

    text   -- string holding the entity definitions
    pos    -- index at which scanning starts (default: 0)
    endpos -- index at which scanning stops (default: len(text))

    Returns a dictionary mapping each entity name to a
    (charcode, comment) tuple, both taken verbatim from the
    declaration.  If a name is declared more than once, the last
    declaration within the scanned range wins.
    """
    if endpos is None:
        endpos = len(text)
    defs = {}
    # finditer() yields successive non-overlapping matches, which is the
    # same sequence as searching again from the end of each match.
    for match in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = match.groups()
        defs[name] = charcode, comment
    return defs
34a8b37ad9fff331b0ba32b27dc83c7af43f6c3fa6Guido van Rossum
def writefile(f,defs):
    """Write *defs* to *f* as Python source defining "entitydefs".

    f    -- object with a write() method accepting strings
    defs -- dictionary mapping entity name to a (charcode, comment)
            tuple, as returned by parse()

    One dictionary entry is written per entity, sorted by name, with
    the whitespace-collapsed comment appended as a trailing "#" comment.
    A charcode of the form "&#NNN;" with NNN below 256 is written as a
    one-character string literal using an octal escape; every other
    charcode is written as the repr() of the original string.
    """
    f.write("entitydefs = {\n")
    # sorted() works on lists and dictionary views alike.
    for name, (charcode, comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Strip the leading "&#" and the trailing ";".
            code = int(charcode[2:-1])
            if code < 256:
                charcode = "'\\%o'" % code
            else:
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        # Fold line breaks and runs of whitespace into single spaces so
        # the comment stays on one line of the generated source.
        comment = ' '.join(comment.split())
        f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment))
    f.write('\n}\n')
52a8b37ad9fff331b0ba32b27dc83c7af43f6c3fa6Guido van Rossum
if __name__ == '__main__':
    # Command line: [infile [outfile]]
    # A missing infile means stdin, a missing outfile means stdout.
    if len(sys.argv) > 1:
        infile = open(sys.argv[1])
    else:
        infile = sys.stdin
    try:
        text = infile.read()
    finally:
        # Only close what was opened here; leave stdin alone.
        if infile is not sys.stdin:
            infile.close()
    defs = parse(text)
    # The output file is opened only after the input has been read and
    # parsed, so it is not truncated if reading the input fails.
    if len(sys.argv) > 2:
        outfile = open(sys.argv[2],'w')
    else:
        outfile = sys.stdout
    try:
        writefile(outfile,defs)
    finally:
        # Only close what was opened here; leave stdout alone.
        if outfile is not sys.stdout:
            outfile.close()
65