"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import markupbase
import re

# Regular expressions used for parsing

# Characters that interrupt plain character data in normal (non-CDATA) mode.
interesting_normal = re.compile('[&<]')
# Start of something that may become an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume ONE terminating character (";" or
# anything else); callers step back by one when it is not ";".
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Tag name followed by any whitespace / stray slashes (but not the "/" of
# a closing "/>").
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# One attribute: group 1 is the name, group 2 the optional "=value" part,
# group 3 the value itself (still quoted if it was quoted in the input).
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

# Matches a start tag up to, but not including, its closing ">" or "/>".
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- human-readable description of the problem
        lineno -- first element of the position pair, or None
        offset -- second element of the position pair, or None; it is
                  reported 1-based (offset + 1) by __str__()
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) position pair."""
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by line and column when known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result


class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is treated as CDATA: after their start tag
    # only the matching end tag is recognised as markup.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for message at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of elem is interesting."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # In CDATA mode nothing is emitted until the closing
                    # tag has been seen.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    # At EOF: emit the unterminated construct as text, up
                    # to and including the next '>', or up to the next
                    # '<', or just the single '<' character.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also matched one terminating character;
                    # give it back unless it was the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        # Emit exactly the two characters at the current
                        # position and step past them.  (Slicing from the
                        # start of the buffer here would emit the wrong
                        # text and move the position backwards.)
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same one-character give-back as for charref above.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep whatever was not consumed for the next feed()/close().
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "=value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the end of the tag:
            # pass the whole tag through as character data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside a CDATA element a malformed end tag is just text.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # End tag of some other element inside CDATA content.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    entitydefs = None
    def unescape(self, s):
        """Return s with ';'-terminated references replaced by characters.

        References that cannot be resolved (unknown entity name, or a
        numeric value that cannot be converted to a character) are left
        in the result in their '&...;' form.
        """
        if '&' not in s:
            return s
        def replaceEntities(match):
            s = match.group(1)
            try:
                if s[0] == "#":
                    s = s[1:]
                    if s[0] in ['x', 'X']:
                        c = int(s[1:], 16)
                    else:
                        c = int(s)
                    return unichr(c)
            except (ValueError, OverflowError):
                # Not a number, or a code point unichr() cannot represent
                # (out of range, or too large for a C long).
                return '&#'+s+';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser supports apos,
                # which is not part of HTML 4
                import htmlentitydefs
                if HTMLParser.entitydefs is None:
                    # Build the name -> character table once and share it
                    # through the class attribute.
                    entitydefs = HTMLParser.entitydefs = {'apos': u"'"}
                    for k, v in htmlentitydefs.name2codepoint.iteritems():
                        entitydefs[k] = unichr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    return '&'+s+';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)