parseentities.py revision a8b37ad9fff331b0ba32b27dc83c7af43f6c3fa6
1#!/usr/local/bin/python
2""" Utility for parsing HTML entity definitions available from:
3
4      http://www.w3.org/ as e.g.
5      http://www.w3.org/TR/REC-html40/HTMLlat1.ent
6
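    Input is read from the file named by the first command-line argument
    (default: stdin); output is written to the file named by the second
    argument (default: stdout) in the form of a Python snippet defining a
    dictionary "entitydefs" mapping literal entity name to character or
    numeric entity.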
10
11    Marc-Andre Lemburg, mal@lemburg.com, 1999.
12    Use as you like. NO WARRANTIES.
13
14"""
15import re,sys
16import TextTools
17
# Matches one SGML entity declaration of the form
#
#     <!ENTITY name CDATA "charcode" -- comment -->
#
# and captures (name, charcode, comment).  The comment may span several
# lines, hence the explicit (?:.|\n) alternative instead of a bare dot.
# A raw string is used so that \w and \n reach the regex engine as written
# instead of depending on how Python treats unknown string escapes.
entityRE = re.compile(
    r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text,pos=0,endpos=None):
    """ Collect all entity declarations found in text.

        text   -- string holding the contents of an entity definition file
        pos    -- index at which to start searching (default 0)
        endpos -- index at which to stop searching (default: len(text))

        Returns a dictionary mapping each entity name to a tuple
        (charcode, comment) holding the quoted replacement text and the
        descriptive comment of the declaration.  If a name is declared
        more than once, the last declaration wins.

    """
    # NOTE: pos used to be reset to 0 here, which silently discarded the
    # caller's start offset; it is now honoured.
    if endpos is None:
        endpos = len(text)
    d = {}
    # finditer() yields successive non-overlapping matches, each search
    # resuming where the previous match ended.
    for m in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = m.groups()
        d[name] = charcode, comment
    return d
34
def writefile(f,defs):
    """ Write defs to f as Python source defining a dictionary "entitydefs".

        f    -- object with a write() method accepting strings
        defs -- dictionary mapping entity name to a (charcode, comment)
                tuple, as returned by parse()

        Entries are written in sorted name order, one per line, with the
        comment appended as a trailing "#" comment.  A charcode of the
        form "&#NNN;" with NNN below 256 is written as a one-character
        string literal using an octal escape; every other charcode is
        written as repr(charcode).

        Raises ValueError if a charcode starts with "&#" but the text
        between "&#" and the final character is not a decimal integer.

    """
    f.write("entitydefs = {\n")
    # sorted() accepts both the list returned by items() on Python 2 and
    # the view returned on Python 3 (which has no sort() method).
    for name, (charcode, comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Strip the leading "&#" and the trailing ";".
            code = int(charcode[2:-1])
            if code < 256:
                # Emits e.g. '\240': a literal backslash followed by the
                # octal digits, inside single quotes.
                charcode = "'\\%o'" % code
            else:
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        # Squeeze line breaks and whitespace runs into single spaces so
        # the comment fits on one output line.  This replaces the former
        # TextTools.collapse() call with a standard-library idiom;
        # NOTE(review): assumed to be equivalent for these comments.
        comment = ' '.join(comment.split())
        f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment))
    f.write('\n}\n')
52
if __name__ == '__main__':
    # Usage: parseentities.py [infile [outfile]]
    # Missing arguments default to stdin and stdout respectively.
    if len(sys.argv) > 1:
        infile = open(sys.argv[1])
    else:
        infile = sys.stdin
    try:
        text = infile.read()
    finally:
        # Only close what was opened here; leave stdin alone.
        if infile is not sys.stdin:
            infile.close()
    defs = parse(text)
    # The output file is opened only after the input has been read and
    # parsed, so a failure up to this point does not truncate it.
    if len(sys.argv) > 2:
        outfile = open(sys.argv[2],'w')
    else:
        outfile = sys.stdout
    try:
        writefile(outfile,defs)
    finally:
        # Closing flushes the output; leave stdout alone.
        if outfile is not sys.stdout:
            outfile.close()
65
66