#!/usr/bin/env python
""" Utility for parsing HTML entity definitions available from:

      http://www.w3.org/ as e.g.
      http://www.w3.org/TR/REC-html40/HTMLlat1.ent

    Input is read from stdin, output is written to stdout in form of a
    Python snippet defining a dictionary "entitydefs" mapping literal
    entity name to character or numeric entity.

    Optionally, an input file name and an output file name may be given
    as the first and second command line arguments.

    Marc-Andre Lemburg, mal@lemburg.com, 1999.
    Use as you like. NO WARRANTIES.

"""
import re,sys
try:
    # Optional third-party module; only its collapse() helper is used.
    import TextTools
except ImportError:
    TextTools = None

# Matches one declaration of the form
#   <!ENTITY name CDATA "charcode" -- comment -->
# and captures name, charcode and the (possibly multi-line) comment.
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

def _collapse(text):

    """ Return text with every run of whitespace replaced by one space.

        Uses TextTools.collapse() when that module is importable
        (presumably equivalent -- verify against mxTextTools), otherwise
        a standard library equivalent.
    """
    if TextTools is not None:
        return TextTools.collapse(text)
    return ' '.join(text.split())

def parse(text,pos=0,endpos=None):

    """ Extract all entity declarations found in text[pos:endpos].

        pos defaults to the start of text and endpos to its end.

        Returns a dictionary mapping each entity name to a tuple
        (charcode, comment); a later declaration of the same name
        replaces an earlier one.
    """
    if endpos is None:
        endpos = len(text)
    d = {}
    while 1:
        m = entityRE.search(text,pos,endpos)
        if not m:
            break
        name,charcode,comment = m.groups()
        d[name] = charcode,comment
        # Continue searching right after the declaration just consumed.
        pos = m.end()
    return d

def writefile(f,defs):

    """ Write defs to f as a Python snippet defining "entitydefs".

        f is any object with a write() method; defs is a dictionary in
        the format returned by parse().  Entries are written sorted by
        entity name, one per line, with the comment appended after '#'.

        Raises ValueError if a charcode starts with '&#' but the part
        between '&#' and the final character is not a decimal number.
    """
    f.write("entitydefs = {\n")
    for name,(charcode,comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Numeric reference "&#NNN;": strip "&#" and the trailing ";".
            code = int(charcode[2:-1])
            if code < 256:
                # Codes below 256 are emitted as an octal string escape.
                charcode = "'\\%o'" % code
            else:
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        comment = _collapse(comment)
        f.write(" '%s':\t%s, \t# %s\n" % (name,charcode,comment))
    f.write('\n}\n')

if __name__ == '__main__':
    if len(sys.argv) > 1:
        infile = open(sys.argv[1])
    else:
        infile = sys.stdin
    try:
        text = infile.read()
    finally:
        # Only close what was opened here, never the standard streams.
        if infile is not sys.stdin:
            infile.close()
    defs = parse(text)
    if len(sys.argv) > 2:
        outfile = open(sys.argv[2],'w')
    else:
        outfile = sys.stdout
    try:
        writefile(outfile,defs)
    finally:
        if outfile is not sys.stdout:
            outfile.close()