HTMLParser.py revision 5a88853bdc1074e62441c7558502bd989c39f056
1d995e1150cab57ed7c885d4b7dd943495022936bFred Drake"""A parser for HTML and XHTML.""" 2d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 3d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# This file is based on sgmllib.py, but the API is slightly different. 4d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 5d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# XXX There should be a way to distinguish between PCDATA (parsed 6d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# character data -- the normal case), RCDATA (replaceable character 7d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# data -- only char and entity references and end tags are special) 8d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# and CDATA (character data -- only end tags are special). 9d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 10d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 11d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeimport markupbase 12d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeimport re 13d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 14d995e1150cab57ed7c885d4b7dd943495022936bFred Drake# Regular expressions used for parsing 15d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 16d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeinteresting_normal = re.compile('[&<]') 17d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeincomplete = re.compile('&[a-zA-Z#]') 18d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 19d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeentityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') 20d995e1150cab57ed7c885d4b7dd943495022936bFred Drakecharref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') 21d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 22d995e1150cab57ed7c885d4b7dd943495022936bFred Drakestarttagopen = re.compile('<[a-zA-Z]') 23d995e1150cab57ed7c885d4b7dd943495022936bFred Drakepiclose = re.compile('>') 24d995e1150cab57ed7c885d4b7dd943495022936bFred Drakecommentclose = re.compile(r'--\s*>') 
25b8147452265077d4c12464a9943903f0d040f79cEzio Melotti 26f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state 27f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state 28b8147452265077d4c12464a9943903f0d040f79cEzio Melotti# note: if you change tagfind/attrfind remember to update locatestarttagend too 29b8147452265077d4c12464a9943903f0d040f79cEzio Melottitagfind = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') 30b8147452265077d4c12464a9943903f0d040f79cEzio Melotti# this regex is currently unused, but left for backward compatibility 31f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melottitagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') 320f1571ce7fb7da0e2ad75f941b29f2d19717e012Ezio Melotti 33d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeattrfind = re.compile( 34c45868ec697a70a80d1cf8a511894f073fda3a27Ezio Melotti r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' 3536b7361fe76733b3a4944ef92b49bcea4584b740Ezio Melotti r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') 36d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 37d995e1150cab57ed7c885d4b7dd943495022936bFred Drakelocatestarttagend = re.compile(r""" 38b8147452265077d4c12464a9943903f0d040f79cEzio Melotti <[a-zA-Z][^\t\n\r\f />\x00]* # tag name 3936b7361fe76733b3a4944ef92b49bcea4584b740Ezio Melotti (?:[\s/]* # optional whitespace before attribute name 4036b7361fe76733b3a4944ef92b49bcea4584b740Ezio Melotti (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name 410f1571ce7fb7da0e2ad75f941b29f2d19717e012Ezio Melotti (?:\s*=+\s* # value indicator 42d995e1150cab57ed7c885d4b7dd943495022936bFred Drake (?:'[^']*' # LITA-enclosed value 430f1571ce7fb7da0e2ad75f941b29f2d19717e012Ezio Melotti |"[^"]*" # LIT-enclosed value 440f1571ce7fb7da0e2ad75f941b29f2d19717e012Ezio Melotti |(?!['"])[^>\s]* # bare value 45d995e1150cab57ed7c885d4b7dd943495022936bFred Drake ) 
4636b7361fe76733b3a4944ef92b49bcea4584b740Ezio Melotti )?(?:\s|/(?!>))* 470f1571ce7fb7da0e2ad75f941b29f2d19717e012Ezio Melotti )* 480f1571ce7fb7da0e2ad75f941b29f2d19717e012Ezio Melotti )? 49d995e1150cab57ed7c885d4b7dd943495022936bFred Drake \s* # trailing whitespace 50d995e1150cab57ed7c885d4b7dd943495022936bFred Drake""", re.VERBOSE) 51d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeendendtag = re.compile('>') 527e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between 537e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti# </ and the tag name, so maybe this should be fixed 54d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeendtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') 55d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 56d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 57d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeclass HTMLParseError(Exception): 58d995e1150cab57ed7c885d4b7dd943495022936bFred Drake """Exception raised for all parse errors.""" 59d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 60d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def __init__(self, msg, position=(None, None)): 61d995e1150cab57ed7c885d4b7dd943495022936bFred Drake assert msg 62d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.msg = msg 63d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.lineno = position[0] 64d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.offset = position[1] 65d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 66d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def __str__(self): 67d995e1150cab57ed7c885d4b7dd943495022936bFred Drake result = self.msg 68d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if self.lineno is not None: 69d995e1150cab57ed7c885d4b7dd943495022936bFred Drake result = result + ", at line %d" % self.lineno 70d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if self.offset is not None: 71d995e1150cab57ed7c885d4b7dd943495022936bFred 
Drake result = result + ", column %d" % (self.offset + 1) 72d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return result 73d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 74d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 75d995e1150cab57ed7c885d4b7dd943495022936bFred Drakeclass HTMLParser(markupbase.ParserBase): 76d995e1150cab57ed7c885d4b7dd943495022936bFred Drake """Find tags and other markup and call handler functions. 77d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 78d995e1150cab57ed7c885d4b7dd943495022936bFred Drake Usage: 79d995e1150cab57ed7c885d4b7dd943495022936bFred Drake p = HTMLParser() 80d995e1150cab57ed7c885d4b7dd943495022936bFred Drake p.feed(data) 81d995e1150cab57ed7c885d4b7dd943495022936bFred Drake ... 82d995e1150cab57ed7c885d4b7dd943495022936bFred Drake p.close() 83d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 84d995e1150cab57ed7c885d4b7dd943495022936bFred Drake Start tags are handled by calling self.handle_starttag() or 85d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.handle_startendtag(); end tags by self.handle_endtag(). The 86d995e1150cab57ed7c885d4b7dd943495022936bFred Drake data between tags is passed from the parser to the derived class 87d995e1150cab57ed7c885d4b7dd943495022936bFred Drake by calling self.handle_data() with the data as argument (the data 88d995e1150cab57ed7c885d4b7dd943495022936bFred Drake may be split up in arbitrary chunks). Entity references are 89d995e1150cab57ed7c885d4b7dd943495022936bFred Drake passed by calling self.handle_entityref() with the entity 90d995e1150cab57ed7c885d4b7dd943495022936bFred Drake reference as the argument. Numeric character references are 91d995e1150cab57ed7c885d4b7dd943495022936bFred Drake passed to self.handle_charref() with the string containing the 92d995e1150cab57ed7c885d4b7dd943495022936bFred Drake reference as the argument. 
93d995e1150cab57ed7c885d4b7dd943495022936bFred Drake """ 94d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 95d995e1150cab57ed7c885d4b7dd943495022936bFred Drake CDATA_CONTENT_ELEMENTS = ("script", "style") 96d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 97d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 98d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def __init__(self): 99d995e1150cab57ed7c885d4b7dd943495022936bFred Drake """Initialize and reset this instance.""" 100d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.reset() 101d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 102d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def reset(self): 103d995e1150cab57ed7c885d4b7dd943495022936bFred Drake """Reset this instance. Loses all unprocessed data.""" 104d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.rawdata = '' 105d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.lasttag = '???' 106d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.interesting = interesting_normal 1077e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti self.cdata_elem = None 108d995e1150cab57ed7c885d4b7dd943495022936bFred Drake markupbase.ParserBase.reset(self) 109d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 110d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def feed(self, data): 11131890bc9ba7c46ed5bcdb91de39f7251badca8b1Éric Araujo r"""Feed data to the parser. 112d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 113d995e1150cab57ed7c885d4b7dd943495022936bFred Drake Call this as often as you want, with as little or as much text 114d995e1150cab57ed7c885d4b7dd943495022936bFred Drake as you want (may include '\n'). 
115d995e1150cab57ed7c885d4b7dd943495022936bFred Drake """ 116d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.rawdata = self.rawdata + data 117d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.goahead(0) 118d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 119d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def close(self): 120d995e1150cab57ed7c885d4b7dd943495022936bFred Drake """Handle any buffered data.""" 121d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.goahead(1) 122d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 123d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def error(self, message): 124d995e1150cab57ed7c885d4b7dd943495022936bFred Drake raise HTMLParseError(message, self.getpos()) 125d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 126d995e1150cab57ed7c885d4b7dd943495022936bFred Drake __starttag_text = None 127d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 128d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def get_starttag_text(self): 129d995e1150cab57ed7c885d4b7dd943495022936bFred Drake """Return full source of start tag: '<...>'.""" 130d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return self.__starttag_text 131d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 1327e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti def set_cdata_mode(self, elem): 1337e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti self.cdata_elem = elem.lower() 13400dc60beee3bf4b68fd658716616f25503a3a9ebEzio Melotti self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) 135d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 136d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def clear_cdata_mode(self): 137d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.interesting = interesting_normal 1387e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti self.cdata_elem = None 139d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 140d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # Internal -- handle data as far as 
reasonable. May leave state 141d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # and data to be processed by a subsequent call. If 'end' is 142d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # true, force handling all data as if followed by EOF marker. 143d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def goahead(self, end): 144d995e1150cab57ed7c885d4b7dd943495022936bFred Drake rawdata = self.rawdata 145d995e1150cab57ed7c885d4b7dd943495022936bFred Drake i = 0 146d995e1150cab57ed7c885d4b7dd943495022936bFred Drake n = len(rawdata) 147d995e1150cab57ed7c885d4b7dd943495022936bFred Drake while i < n: 148d995e1150cab57ed7c885d4b7dd943495022936bFred Drake match = self.interesting.search(rawdata, i) # < or & 149d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if match: 150d995e1150cab57ed7c885d4b7dd943495022936bFred Drake j = match.start() 151d995e1150cab57ed7c885d4b7dd943495022936bFred Drake else: 15200dc60beee3bf4b68fd658716616f25503a3a9ebEzio Melotti if self.cdata_elem: 15300dc60beee3bf4b68fd658716616f25503a3a9ebEzio Melotti break 154d995e1150cab57ed7c885d4b7dd943495022936bFred Drake j = n 155d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if i < j: self.handle_data(rawdata[i:j]) 156d995e1150cab57ed7c885d4b7dd943495022936bFred Drake i = self.updatepos(i, j) 157d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if i == n: break 158d995e1150cab57ed7c885d4b7dd943495022936bFred Drake startswith = rawdata.startswith 159d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if startswith('<', i): 160d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if starttagopen.match(rawdata, i): # < + letter 161d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = self.parse_starttag(i) 162d995e1150cab57ed7c885d4b7dd943495022936bFred Drake elif startswith("</", i): 163d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = self.parse_endtag(i) 164d995e1150cab57ed7c885d4b7dd943495022936bFred Drake elif startswith("<!--", i): 165d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 
k = self.parse_comment(i) 166d995e1150cab57ed7c885d4b7dd943495022936bFred Drake elif startswith("<?", i): 167d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = self.parse_pi(i) 168d995e1150cab57ed7c885d4b7dd943495022936bFred Drake elif startswith("<!", i): 1694b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti k = self.parse_html_declaration(i) 170d995e1150cab57ed7c885d4b7dd943495022936bFred Drake elif (i + 1) < n: 171d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.handle_data("<") 172d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = i + 1 173d995e1150cab57ed7c885d4b7dd943495022936bFred Drake else: 174d995e1150cab57ed7c885d4b7dd943495022936bFred Drake break 175d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if k < 0: 176d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti if not end: 177d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti break 178d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti k = rawdata.find('>', i + 1) 179d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti if k < 0: 180d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti k = rawdata.find('<', i + 1) 181d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti if k < 0: 182d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti k = i + 1 183d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti else: 184d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti k += 1 185d2307cb48ab09baa846947c5c2c4001dce9b6e52Ezio Melotti self.handle_data(rawdata[i:k]) 186d995e1150cab57ed7c885d4b7dd943495022936bFred Drake i = self.updatepos(i, k) 187d995e1150cab57ed7c885d4b7dd943495022936bFred Drake elif startswith("&#", i): 188d995e1150cab57ed7c885d4b7dd943495022936bFred Drake match = charref.match(rawdata, i) 189d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if match: 190d995e1150cab57ed7c885d4b7dd943495022936bFred Drake name = match.group()[2:-1] 191d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.handle_charref(name) 192d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = match.end() 
193d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if not startswith(';', k-1): 194d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = k - 1 195d995e1150cab57ed7c885d4b7dd943495022936bFred Drake i = self.updatepos(i, k) 196d995e1150cab57ed7c885d4b7dd943495022936bFred Drake continue 197d995e1150cab57ed7c885d4b7dd943495022936bFred Drake else: 1985a88853bdc1074e62441c7558502bd989c39f056Ezio Melotti if ";" in rawdata[i:]: # bail by consuming '&#' 1995a88853bdc1074e62441c7558502bd989c39f056Ezio Melotti self.handle_data(rawdata[i:i+2]) 2005a88853bdc1074e62441c7558502bd989c39f056Ezio Melotti i = self.updatepos(i, i+2) 201d995e1150cab57ed7c885d4b7dd943495022936bFred Drake break 202d995e1150cab57ed7c885d4b7dd943495022936bFred Drake elif startswith('&', i): 203d995e1150cab57ed7c885d4b7dd943495022936bFred Drake match = entityref.match(rawdata, i) 204d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if match: 205d995e1150cab57ed7c885d4b7dd943495022936bFred Drake name = match.group(1) 206d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.handle_entityref(name) 207d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = match.end() 208d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if not startswith(';', k-1): 209d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = k - 1 210d995e1150cab57ed7c885d4b7dd943495022936bFred Drake i = self.updatepos(i, k) 211d995e1150cab57ed7c885d4b7dd943495022936bFred Drake continue 212d995e1150cab57ed7c885d4b7dd943495022936bFred Drake match = incomplete.match(rawdata, i) 213d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if match: 214d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # match.group() will contain at least 2 chars 215d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if end and match.group() == rawdata[i:]: 216d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.error("EOF in middle of entity or char ref") 217d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # incomplete 
218d995e1150cab57ed7c885d4b7dd943495022936bFred Drake break 219d995e1150cab57ed7c885d4b7dd943495022936bFred Drake elif (i + 1) < n: 220d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # not the end of the buffer, and can't be confused 221d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # with some other construct 222d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.handle_data("&") 223d995e1150cab57ed7c885d4b7dd943495022936bFred Drake i = self.updatepos(i, i + 1) 224d995e1150cab57ed7c885d4b7dd943495022936bFred Drake else: 225d995e1150cab57ed7c885d4b7dd943495022936bFred Drake break 226d995e1150cab57ed7c885d4b7dd943495022936bFred Drake else: 227d995e1150cab57ed7c885d4b7dd943495022936bFred Drake assert 0, "interesting.search() lied" 228d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # end while 22900dc60beee3bf4b68fd658716616f25503a3a9ebEzio Melotti if end and i < n and not self.cdata_elem: 230d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.handle_data(rawdata[i:n]) 231d995e1150cab57ed7c885d4b7dd943495022936bFred Drake i = self.updatepos(i, n) 232d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.rawdata = rawdata[i:] 233d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 2344b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti # Internal -- parse html declarations, return length or -1 if not terminated 2354b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state 2364b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti # See also parse_declaration in _markupbase 2374b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti def parse_html_declaration(self, i): 2384b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti rawdata = self.rawdata 2394b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti if rawdata[i:i+2] != '<!': 2404b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti self.error('unexpected call to parse_html_declaration()') 2414b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti if 
rawdata[i:i+4] == '<!--': 242369cbd744ed06b3e01fe7a2e6a86ea4d85250231Ezio Melotti # this case is actually already handled in goahead() 2434b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti return self.parse_comment(i) 2444b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti elif rawdata[i:i+3] == '<![': 2454b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti return self.parse_marked_section(i) 2464b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti elif rawdata[i:i+9].lower() == '<!doctype': 2474b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti # find the closing > 248369cbd744ed06b3e01fe7a2e6a86ea4d85250231Ezio Melotti gtpos = rawdata.find('>', i+9) 2494b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti if gtpos == -1: 2504b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti return -1 2514b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti self.handle_decl(rawdata[i+2:gtpos]) 2524b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti return gtpos+1 2534b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti else: 2544b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti return self.parse_bogus_comment(i) 2554b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti 2564b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti # Internal -- parse bogus comment, return length or -1 if not terminated 2574b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state 2584b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti def parse_bogus_comment(self, i, report=1): 2594b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti rawdata = self.rawdata 260f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti if rawdata[i:i+2] not in ('<!', '</'): 2614b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti self.error('unexpected call to parse_comment()') 2624b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti pos = rawdata.find('>', i+2) 2634b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti if pos == -1: 
2644b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti return -1 2654b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti if report: 2664b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti self.handle_comment(rawdata[i+2:pos]) 2674b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti return pos + 1 2684b92cc3f7924e455b7e41cf1a66034a44ede0cc0Ezio Melotti 269d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # Internal -- parse processing instr, return end or -1 if not terminated 270d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def parse_pi(self, i): 271d995e1150cab57ed7c885d4b7dd943495022936bFred Drake rawdata = self.rawdata 272d995e1150cab57ed7c885d4b7dd943495022936bFred Drake assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' 273d995e1150cab57ed7c885d4b7dd943495022936bFred Drake match = piclose.search(rawdata, i+2) # > 274d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if not match: 275d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return -1 276d995e1150cab57ed7c885d4b7dd943495022936bFred Drake j = match.start() 277d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.handle_pi(rawdata[i+2: j]) 278d995e1150cab57ed7c885d4b7dd943495022936bFred Drake j = match.end() 279d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return j 280d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 281d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # Internal -- handle starttag, return end or -1 if not terminated 282d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def parse_starttag(self, i): 283d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.__starttag_text = None 284d995e1150cab57ed7c885d4b7dd943495022936bFred Drake endpos = self.check_for_whole_start_tag(i) 285d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if endpos < 0: 286d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return endpos 287d995e1150cab57ed7c885d4b7dd943495022936bFred Drake rawdata = self.rawdata 288d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.__starttag_text = 
rawdata[i:endpos] 289d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 290d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # Now parse the data between i+1 and j into a tag and attrs 291d995e1150cab57ed7c885d4b7dd943495022936bFred Drake attrs = [] 292d995e1150cab57ed7c885d4b7dd943495022936bFred Drake match = tagfind.match(rawdata, i+1) 293d995e1150cab57ed7c885d4b7dd943495022936bFred Drake assert match, 'unexpected call to parse_starttag()' 294d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = match.end() 295c45868ec697a70a80d1cf8a511894f073fda3a27Ezio Melotti self.lasttag = tag = match.group(1).lower() 296d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 297d995e1150cab57ed7c885d4b7dd943495022936bFred Drake while k < endpos: 298d995e1150cab57ed7c885d4b7dd943495022936bFred Drake m = attrfind.match(rawdata, k) 299d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if not m: 300d995e1150cab57ed7c885d4b7dd943495022936bFred Drake break 301d995e1150cab57ed7c885d4b7dd943495022936bFred Drake attrname, rest, attrvalue = m.group(1, 2, 3) 302d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if not rest: 303d995e1150cab57ed7c885d4b7dd943495022936bFred Drake attrvalue = None 304d995e1150cab57ed7c885d4b7dd943495022936bFred Drake elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 305d995e1150cab57ed7c885d4b7dd943495022936bFred Drake attrvalue[:1] == '"' == attrvalue[-1:]: 306d995e1150cab57ed7c885d4b7dd943495022936bFred Drake attrvalue = attrvalue[1:-1] 3070f1571ce7fb7da0e2ad75f941b29f2d19717e012Ezio Melotti if attrvalue: 308d995e1150cab57ed7c885d4b7dd943495022936bFred Drake attrvalue = self.unescape(attrvalue) 309d995e1150cab57ed7c885d4b7dd943495022936bFred Drake attrs.append((attrname.lower(), attrvalue)) 310d995e1150cab57ed7c885d4b7dd943495022936bFred Drake k = m.end() 311d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 312d995e1150cab57ed7c885d4b7dd943495022936bFred Drake end = rawdata[k:endpos].strip() 313d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if end not 
in (">", "/>"): 314d995e1150cab57ed7c885d4b7dd943495022936bFred Drake lineno, offset = self.getpos() 315d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if "\n" in self.__starttag_text: 316d995e1150cab57ed7c885d4b7dd943495022936bFred Drake lineno = lineno + self.__starttag_text.count("\n") 317d995e1150cab57ed7c885d4b7dd943495022936bFred Drake offset = len(self.__starttag_text) \ 318d995e1150cab57ed7c885d4b7dd943495022936bFred Drake - self.__starttag_text.rfind("\n") 319d995e1150cab57ed7c885d4b7dd943495022936bFred Drake else: 320d995e1150cab57ed7c885d4b7dd943495022936bFred Drake offset = offset + len(self.__starttag_text) 32165d36dab4d915eb9fada52b867301b546e840faeEzio Melotti self.handle_data(rawdata[i:endpos]) 32265d36dab4d915eb9fada52b867301b546e840faeEzio Melotti return endpos 323d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if end.endswith('/>'): 324d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # XHTML-style empty tag: <span attr="value" /> 325d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.handle_startendtag(tag, attrs) 326d995e1150cab57ed7c885d4b7dd943495022936bFred Drake else: 327d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.handle_starttag(tag, attrs) 328d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if tag in self.CDATA_CONTENT_ELEMENTS: 3297e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti self.set_cdata_mode(tag) 330d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return endpos 331d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 332d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # Internal -- check to see if we have a complete starttag; return end 333d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # or -1 if incomplete. 
334d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def check_for_whole_start_tag(self, i): 335d995e1150cab57ed7c885d4b7dd943495022936bFred Drake rawdata = self.rawdata 336d995e1150cab57ed7c885d4b7dd943495022936bFred Drake m = locatestarttagend.match(rawdata, i) 337d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if m: 338d995e1150cab57ed7c885d4b7dd943495022936bFred Drake j = m.end() 339d995e1150cab57ed7c885d4b7dd943495022936bFred Drake next = rawdata[j:j+1] 340d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if next == ">": 341d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return j + 1 342d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if next == "/": 343d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if rawdata.startswith("/>", j): 344d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return j + 2 345d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if rawdata.startswith("/", j): 346d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # buffer boundary 347d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return -1 348d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # else bogus input 349d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.updatepos(i, j + 1) 350d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.error("malformed empty start tag") 351d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if next == "": 352d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # end of input 353d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return -1 354d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if next in ("abcdefghijklmnopqrstuvwxyz=/" 355d995e1150cab57ed7c885d4b7dd943495022936bFred Drake "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): 356d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # end of input in or before attribute value, or we have the 357d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # '/' from a '/>' ending 358d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return -1 35965d36dab4d915eb9fada52b867301b546e840faeEzio Melotti if j > 
i: 36065d36dab4d915eb9fada52b867301b546e840faeEzio Melotti return j 36165d36dab4d915eb9fada52b867301b546e840faeEzio Melotti else: 36265d36dab4d915eb9fada52b867301b546e840faeEzio Melotti return i + 1 363d995e1150cab57ed7c885d4b7dd943495022936bFred Drake raise AssertionError("we should not get here!") 364d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 365d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # Internal -- parse endtag, return end or -1 if incomplete 366d995e1150cab57ed7c885d4b7dd943495022936bFred Drake def parse_endtag(self, i): 367d995e1150cab57ed7c885d4b7dd943495022936bFred Drake rawdata = self.rawdata 368d995e1150cab57ed7c885d4b7dd943495022936bFred Drake assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" 369d995e1150cab57ed7c885d4b7dd943495022936bFred Drake match = endendtag.search(rawdata, i+1) # > 370d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if not match: 371d995e1150cab57ed7c885d4b7dd943495022936bFred Drake return -1 372f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti gtpos = match.end() 373d995e1150cab57ed7c885d4b7dd943495022936bFred Drake match = endtagfind.match(rawdata, i) # </ + tag + > 374d995e1150cab57ed7c885d4b7dd943495022936bFred Drake if not match: 3757e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti if self.cdata_elem is not None: 376f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti self.handle_data(rawdata[i:gtpos]) 377f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti return gtpos 378f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti # find the name: w3.org/TR/html5/tokenization.html#tag-name-state 379b8147452265077d4c12464a9943903f0d040f79cEzio Melotti namematch = tagfind.match(rawdata, i+2) 380f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti if not namematch: 381f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti # w3.org/TR/html5/tokenization.html#end-tag-open-state 382f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti if rawdata[i:i+3] == '</>': 
383f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti return i+3 384f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti else: 385f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti return self.parse_bogus_comment(i) 386b8147452265077d4c12464a9943903f0d040f79cEzio Melotti tagname = namematch.group(1).lower() 387f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti # consume and ignore other stuff between the name and the > 388f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti # Note: this is not 100% correct, since we might have things like 389f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti # </tag attr=">">, but looking for > after tha name should cover 390f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti # most of the cases and is much simpler 391f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti gtpos = rawdata.find('>', namematch.end()) 392f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti self.handle_endtag(tagname) 393f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti return gtpos+1 3947e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti 3957e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti elem = match.group(1).lower() # script or style 3967e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti if self.cdata_elem is not None: 3977e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti if elem != self.cdata_elem: 398f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti self.handle_data(rawdata[i:gtpos]) 399f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti return gtpos 4007e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti 4017e82b276dd5c1f786e7bd3c1554ac2017a909ab9Ezio Melotti self.handle_endtag(elem) 402d995e1150cab57ed7c885d4b7dd943495022936bFred Drake self.clear_cdata_mode() 403f117443cb8afa3b2d91b4fef861db17866d6b6dfEzio Melotti return gtpos 404d995e1150cab57ed7c885d4b7dd943495022936bFred Drake 405d995e1150cab57ed7c885d4b7dd943495022936bFred Drake # Overridable -- finish processing of start+end tag: <tag.../> 
def handle_startendtag(self, tag, attrs):
    """Handle a combined start+end tag (<tag.../>).

    The default forwards to handle_starttag(tag, attrs) and then to
    handle_endtag(tag), in that order.
    """
    self.handle_starttag(tag, attrs)
    self.handle_endtag(tag)

# Overridable -- start tag hook
def handle_starttag(self, tag, attrs):
    """Handle a start tag; the default implementation does nothing."""

# Overridable -- end tag hook
def handle_endtag(self, tag):
    """Handle an end tag; the default implementation does nothing."""

# Overridable -- character reference hook
def handle_charref(self, name):
    """Handle a character reference; the default does nothing."""

# Overridable -- entity reference hook
def handle_entityref(self, name):
    """Handle an entity reference; the default does nothing."""

# Overridable -- data hook
def handle_data(self, data):
    """Handle a run of data; the default implementation does nothing."""

# Overridable -- comment hook
def handle_comment(self, data):
    """Handle a comment; the default implementation does nothing."""
# Overridable -- handle declaration
def handle_decl(self, decl):
    """Handle a declaration; the default implementation does nothing."""
    pass

# Overridable -- handle processing instruction
def handle_pi(self, data):
    """Handle a processing instruction; the default does nothing."""
    pass

def unknown_decl(self, data):
    """Overridable hook; the default implementation does nothing.

    NOTE(review): judging by the name this is meant for declarations the
    parser does not recognise -- confirm against the caller.
    """
    pass

# Internal -- helper to remove special character quoting
entitydefs = None
def unescape(self, s):
    """Return s with character and entity references replaced.

    Only references terminated by ';' are considered:

    * "&#NNN;" and "&#xHHH;" (or "&#XHHH;") become unichr() of the
      decimal / hexadecimal number;
    * "&name;" is looked up in self.entitydefs, which is filled on first
      use from htmlentitydefs.name2codepoint plus 'apos'.

    A reference that cannot be converted (malformed number, code point
    that unichr() rejects, unknown name) is left in the output as text.
    A string without any '&' is returned unchanged.
    """
    if '&' not in s:
        return s

    def replaceEntities(match):
        ref = match.groups()[0]
        if ref[0] == "#":
            digits = ref[1:]
            try:
                if digits[0] in ('x', 'X'):
                    code = int(digits[1:], 16)
                else:
                    code = int(digits)
                return unichr(code)
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside unichr()'s range.
                # OverflowError: number too large for unichr() to even
                # accept.  Either way the reference is kept as text
                # instead of aborting the whole substitution.
                return '&#' + digits + ';'

        # Cannot use name2codepoint directly, because HTMLParser supports
        # apos, which is not part of HTML 4
        import htmlentitydefs
        if HTMLParser.entitydefs is None:
            # Build the table completely before storing it on the class.
            table = {'apos': u"'"}
            for name, codepoint in htmlentitydefs.name2codepoint.iteritems():
                table[name] = unichr(codepoint)
            HTMLParser.entitydefs = table
        try:
            return self.entitydefs[ref]
        except KeyError:
            return '&' + ref + ';'

    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)