"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


try:
    import markupbase
except ImportError:
    # The module is named _markupbase on interpreters that renamed it;
    # fall back so this file can still be imported there.
    import _markupbase as markupbase
import re

# Regular expressions used for parsing

# In normal (non-CDATA) mode only '&' and '<' can start something special.
interesting_normal = re.compile('[&<]')
# Start of an entity or character reference that may still be incomplete.
incomplete = re.compile('&[a-zA-Z#]')

# Both patterns also consume ONE terminating character (';' or whatever
# follows the reference); the caller backs up when it is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Tag name followed by any whitespace or '/' that does not begin '/>'.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# group 1: attribute name, group 2: '=' plus value (optional),
# group 3: the value itself, still including any surrounding quotes.
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error message passed to the constructor.
        lineno: first item of *position*, or None when unknown.
        offset: second item of *position* (0-based column), or None.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        # Fill in Exception.args; without it the instance cannot be
        # re-created by pickle/copy because msg is a required argument.
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by line and 1-based column if known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result


724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoclass HTMLParser(markupbase.ParserBase): 734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao """Find tags and other markup and call handler functions. 744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao Usage: 764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao p = HTMLParser() 774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao p.feed(data) 784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao ... 794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao p.close() 804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao Start tags are handled by calling self.handle_starttag() or 824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_startendtag(); end tags by self.handle_endtag(). The 834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao data between tags is passed from the parser to the derived class 844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao by calling self.handle_data() with the data as argument (the data 854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao may be split up in arbitrary chunks). Entity references are 864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao passed by calling self.handle_entityref() with the entity 874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao reference as the argument. Numeric character references are 884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao passed to self.handle_charref() with the string containing the 894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao reference as the argument. 
904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao """ 914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao CDATA_CONTENT_ELEMENTS = ("script", "style") 934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def __init__(self): 964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao """Initialize and reset this instance.""" 974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.reset() 984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def reset(self): 1004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao """Reset this instance. Loses all unprocessed data.""" 1014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.rawdata = '' 1024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.lasttag = '???' 1034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.interesting = interesting_normal 1044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.cdata_elem = None 1054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao markupbase.ParserBase.reset(self) 1064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 1074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def feed(self, data): 1084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao r"""Feed data to the parser. 1094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 1104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao Call this as often as you want, with as little or as much text 1114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao as you want (may include '\n'). 
1124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao """ 1134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.rawdata = self.rawdata + data 1144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.goahead(0) 1154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 1164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def close(self): 1174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao """Handle any buffered data.""" 1184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.goahead(1) 1194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 1204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def error(self, message): 1214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao raise HTMLParseError(message, self.getpos()) 1224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 1234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao __starttag_text = None 1244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 1254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def get_starttag_text(self): 1264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao """Return full source of start tag: '<...>'.""" 1274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return self.__starttag_text 1284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 1294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def set_cdata_mode(self, elem): 1304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.cdata_elem = elem.lower() 1314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) 1324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 1334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def clear_cdata_mode(self): 1344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.interesting = interesting_normal 1354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.cdata_elem = None 1364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 1374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Internal -- handle data as far as reasonable. 
May leave state 1384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # and data to be processed by a subsequent call. If 'end' is 1394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # true, force handling all data as if followed by EOF marker. 1404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def goahead(self, end): 1414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao rawdata = self.rawdata 1424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao i = 0 1434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao n = len(rawdata) 1444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao while i < n: 1454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao match = self.interesting.search(rawdata, i) # < or & 1464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if match: 1474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao j = match.start() 1484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 1494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if self.cdata_elem: 1504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao break 1514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao j = n 1524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if i < j: self.handle_data(rawdata[i:j]) 1534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao i = self.updatepos(i, j) 1544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if i == n: break 1554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao startswith = rawdata.startswith 1564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if startswith('<', i): 1574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if starttagopen.match(rawdata, i): # < + letter 1584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = self.parse_starttag(i) 1594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif startswith("</", i): 1604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = self.parse_endtag(i) 1614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif startswith("<!--", i): 1624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = self.parse_comment(i) 1634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif 
startswith("<?", i): 1644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = self.parse_pi(i) 1654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif startswith("<!", i): 1664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = self.parse_html_declaration(i) 1674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif (i + 1) < n: 1684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_data("<") 1694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = i + 1 1704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 1714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao break 1724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if k < 0: 1734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if not end: 1744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao break 1754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = rawdata.find('>', i + 1) 1764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if k < 0: 1774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = rawdata.find('<', i + 1) 1784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if k < 0: 1794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = i + 1 1804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 1814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k += 1 1824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_data(rawdata[i:k]) 1834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao i = self.updatepos(i, k) 1844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif startswith("&#", i): 1854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao match = charref.match(rawdata, i) 1864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if match: 1874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao name = match.group()[2:-1] 1884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_charref(name) 1894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = match.end() 1904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if not startswith(';', k-1): 1914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = k - 1 
1924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao i = self.updatepos(i, k) 1934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao continue 1944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 1954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if ";" in rawdata[i:]: #bail by consuming &# 1964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_data(rawdata[0:2]) 1974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao i = self.updatepos(i, 2) 1984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao break 1994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif startswith('&', i): 2004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao match = entityref.match(rawdata, i) 2014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if match: 2024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao name = match.group(1) 2034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_entityref(name) 2044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = match.end() 2054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if not startswith(';', k-1): 2064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = k - 1 2074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao i = self.updatepos(i, k) 2084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao continue 2094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao match = incomplete.match(rawdata, i) 2104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if match: 2114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # match.group() will contain at least 2 chars 2124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if end and match.group() == rawdata[i:]: 2134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.error("EOF in middle of entity or char ref") 2144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # incomplete 2154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao break 2164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif (i + 1) < n: 2174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # not the end of the buffer, and can't be confused 
2184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # with some other construct 2194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_data("&") 2204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao i = self.updatepos(i, i + 1) 2214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 2224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao break 2234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 2244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao assert 0, "interesting.search() lied" 2254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # end while 2264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if end and i < n and not self.cdata_elem: 2274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_data(rawdata[i:n]) 2284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao i = self.updatepos(i, n) 2294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.rawdata = rawdata[i:] 2304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 2314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Internal -- parse html declarations, return length or -1 if not terminated 2324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state 2334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # See also parse_declaration in _markupbase 2344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def parse_html_declaration(self, i): 2354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao rawdata = self.rawdata 2364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if rawdata[i:i+2] != '<!': 2374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.error('unexpected call to parse_html_declaration()') 2384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if rawdata[i:i+4] == '<!--': 2394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # this case is actually already handled in goahead() 2404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return self.parse_comment(i) 2414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif rawdata[i:i+3] == '<![': 
2424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return self.parse_marked_section(i) 2434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif rawdata[i:i+9].lower() == '<!doctype': 2444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # find the closing > 2454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao gtpos = rawdata.find('>', i+9) 2464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if gtpos == -1: 2474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return -1 2484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_decl(rawdata[i+2:gtpos]) 2494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return gtpos+1 2504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 2514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return self.parse_bogus_comment(i) 2524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 2534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Internal -- parse bogus comment, return length or -1 if not terminated 2544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state 2554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def parse_bogus_comment(self, i, report=1): 2564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao rawdata = self.rawdata 2574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if rawdata[i:i+2] not in ('<!', '</'): 2584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.error('unexpected call to parse_comment()') 2594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao pos = rawdata.find('>', i+2) 2604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if pos == -1: 2614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return -1 2624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if report: 2634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_comment(rawdata[i+2:pos]) 2644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return pos + 1 2654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 2664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Internal -- parse processing instr, return end or -1 if 
not terminated 2674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def parse_pi(self, i): 2684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao rawdata = self.rawdata 2694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' 2704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao match = piclose.search(rawdata, i+2) # > 2714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if not match: 2724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return -1 2734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao j = match.start() 2744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_pi(rawdata[i+2: j]) 2754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao j = match.end() 2764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return j 2774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 2784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Internal -- handle starttag, return end or -1 if not terminated 2794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def parse_starttag(self, i): 2804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.__starttag_text = None 2814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao endpos = self.check_for_whole_start_tag(i) 2824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if endpos < 0: 2834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return endpos 2844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao rawdata = self.rawdata 2854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.__starttag_text = rawdata[i:endpos] 2864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 2874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Now parse the data between i+1 and j into a tag and attrs 2884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao attrs = [] 2894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao match = tagfind.match(rawdata, i+1) 2904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao assert match, 'unexpected call to parse_starttag()' 2914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = match.end() 
2924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.lasttag = tag = match.group(1).lower() 2934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 2944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao while k < endpos: 2954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao m = attrfind.match(rawdata, k) 2964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if not m: 2974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao break 2984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao attrname, rest, attrvalue = m.group(1, 2, 3) 2994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if not rest: 3004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao attrvalue = None 3014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 3024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao attrvalue[:1] == '"' == attrvalue[-1:]: 3034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao attrvalue = attrvalue[1:-1] 3044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if attrvalue: 3054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao attrvalue = self.unescape(attrvalue) 3064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao attrs.append((attrname.lower(), attrvalue)) 3074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao k = m.end() 3084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 3094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao end = rawdata[k:endpos].strip() 3104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if end not in (">", "/>"): 3114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao lineno, offset = self.getpos() 3124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if "\n" in self.__starttag_text: 3134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao lineno = lineno + self.__starttag_text.count("\n") 3144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao offset = len(self.__starttag_text) \ 3154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao - self.__starttag_text.rfind("\n") 3164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 3174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao offset = 
offset + len(self.__starttag_text) 3184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_data(rawdata[i:endpos]) 3194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return endpos 3204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if end.endswith('/>'): 3214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # XHTML-style empty tag: <span attr="value" /> 3224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_startendtag(tag, attrs) 3234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 3244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_starttag(tag, attrs) 3254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if tag in self.CDATA_CONTENT_ELEMENTS: 3264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.set_cdata_mode(tag) 3274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return endpos 3284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 3294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Internal -- check to see if we have a complete starttag; return end 3304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # or -1 if incomplete. 
3314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def check_for_whole_start_tag(self, i): 3324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao rawdata = self.rawdata 3334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao m = locatestarttagend.match(rawdata, i) 3344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if m: 3354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao j = m.end() 3364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao next = rawdata[j:j+1] 3374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if next == ">": 3384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return j + 1 3394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if next == "/": 3404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if rawdata.startswith("/>", j): 3414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return j + 2 3424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if rawdata.startswith("/", j): 3434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # buffer boundary 3444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return -1 3454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # else bogus input 3464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.updatepos(i, j + 1) 3474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.error("malformed empty start tag") 3484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if next == "": 3494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # end of input 3504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return -1 3514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if next in ("abcdefghijklmnopqrstuvwxyz=/" 3524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): 3534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # end of input in or before attribute value, or we have the 3544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # '/' from a '/>' ending 3554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return -1 3564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if j > i: 3574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return j 
3584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 3594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return i + 1 3604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao raise AssertionError("we should not get here!") 3614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 3624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Internal -- parse endtag, return end or -1 if incomplete 3634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def parse_endtag(self, i): 3644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao rawdata = self.rawdata 3654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" 3664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao match = endendtag.search(rawdata, i+1) # > 3674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if not match: 3684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return -1 3694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao gtpos = match.end() 3704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao match = endtagfind.match(rawdata, i) # </ + tag + > 3714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if not match: 3724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if self.cdata_elem is not None: 3734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_data(rawdata[i:gtpos]) 3744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return gtpos 3754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # find the name: w3.org/TR/html5/tokenization.html#tag-name-state 3764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao namematch = tagfind_tolerant.match(rawdata, i+2) 3774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if not namematch: 3784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # w3.org/TR/html5/tokenization.html#end-tag-open-state 3794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if rawdata[i:i+3] == '</>': 3804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return i+3 3814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao else: 3824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return 
self.parse_bogus_comment(i) 3834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao tagname = namematch.group().lower() 3844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # consume and ignore other stuff between the name and the > 3854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Note: this is not 100% correct, since we might have things like 3864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # </tag attr=">">, but looking for > after tha name should cover 3874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # most of the cases and is much simpler 3884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao gtpos = rawdata.find('>', namematch.end()) 3894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_endtag(tagname) 3904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return gtpos+1 3914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 3924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao elem = match.group(1).lower() # script or style 3934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if self.cdata_elem is not None: 3944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao if elem != self.cdata_elem: 3954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_data(rawdata[i:gtpos]) 3964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return gtpos 3974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 3984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_endtag(elem) 3994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.clear_cdata_mode() 4004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao return gtpos 4014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 4024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Overridable -- finish processing of start+end tag: <tag.../> 4034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def handle_startendtag(self, tag, attrs): 4044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_starttag(tag, attrs) 4054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao self.handle_endtag(tag) 4064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 
4074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Overridable -- handle start tag 4084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def handle_starttag(self, tag, attrs): 4094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao pass 4104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 4114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Overridable -- handle end tag 4124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def handle_endtag(self, tag): 4134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao pass 4144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 4154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Overridable -- handle character reference 4164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def handle_charref(self, name): 4174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao pass 4184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 4194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Overridable -- handle entity reference 4204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def handle_entityref(self, name): 4214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao pass 4224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 4234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Overridable -- handle data 4244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def handle_data(self, data): 4254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao pass 4264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 4274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Overridable -- handle comment 4284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def handle_comment(self, data): 4294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao pass 4304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 4314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # Overridable -- handle declaration 4324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao def handle_decl(self, decl): 4334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao pass 4344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao 4354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao # 
# Overridable -- handle processing instruction
def handle_pi(self, data):
    """Hook for a processing instruction; the default implementation does nothing."""
    pass

# NOTE(review): presumably the hook for declarations the parser does not
# recognise -- the caller is outside this chunk, confirm before relying on it.
def unknown_decl(self, data):
    """Default implementation does nothing."""
    pass

# Internal -- helper to remove special character quoting
# Entity-name -> character table; built on first use by unescape() and
# stored on the HTMLParser class so it is shared between instances.
entitydefs = None

def unescape(self, s):
    """Return s with character and entity references replaced.

    Handles decimal (&#65;) and hexadecimal (&#x41;) character references
    and named entities (&amp;).  A reference that cannot be resolved --
    unknown entity name, malformed number, or a code point unichr()
    rejects -- is returned as it appeared in the input.
    """
    if '&' not in s:
        return s

    def replaceEntities(match):
        ref = match.group(1)
        if ref[0] == "#":
            digits = ref[1:]
            try:
                if digits[0] in ('x', 'X'):
                    codepoint = int(digits[1:], 16)
                else:
                    codepoint = int(digits)
                return unichr(codepoint)
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the Unicode range.
                # OverflowError: so many digits that the value does not fit
                # the integer type unichr() accepts; the regex puts no bound
                # on the digit count, so this must not escape to the caller.
                return '&#' + digits + ';'

        # Cannot use name2codepoint directly, because HTMLParser supports
        # apos, which is not part of HTML 4
        import htmlentitydefs
        if HTMLParser.entitydefs is None:
            # Build the table locally and publish it only once complete, so
            # the class attribute never refers to a half-filled dict.
            table = {'apos': u"'"}
            for name, value in htmlentitydefs.name2codepoint.iteritems():
                table[name] = unichr(value)
            HTMLParser.entitydefs = table
        try:
            return self.entitydefs[ref]
        except KeyError:
            return '&' + ref + ';'

    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)