"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.


# Python 2 only: tell users running with -3 that this module has no
# Python 3 counterpart.  The helper is deleted again so that it does not
# linger in the module namespace.
from warnings import warnpy3k
warnpy3k("the sgmllib module has been removed in Python 3.0",
         stacklevel=2)
del warnpy3k

import markupbase
import re

__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# A '&' or '<': the only characters that can start a reference or markup.
interesting = re.compile('[&<]')

# A '&' or '<' followed by as much of a reference, start tag, end tag or
# '<!' declaration as is present.  Every part after the first character is
# optional, so this also matches a construct that was cut off mid-way.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# '&name' plus exactly one following non-alphanumeric character; group 1 is
# the entity name.  The trailing character is part of the match.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' plus exactly one following non-digit character; group 1 is the
# digit string.  The trailing character is part of the match.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the empty start tag '<>'.
starttagopen = re.compile('<[>a-zA-Z]')
# Opening of the SGML short-tag form '<name/'.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete short tag '<name/data/'; group 1 is the name, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# The '>' that terminates a processing instruction.
piclose = re.compile('>')
# Either angle bracket.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: optional whitespace, the name (group 1) and an optional
# '= value' part (group 2).  The value (group 3) is single-quoted,
# double-quoted, or an unquoted run of the listed characters.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')


class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
    pass


# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# The dtd is defined by deriving a class which defines methods 520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# with special names to handle tags: start_foo and end_foo to handle 530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself. 540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# (Tags are converted to lower case for this purpose.) The data 550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# between tags is passed to the parser by calling self.handle_data() 560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# with some data as argument (the data may be split up in arbitrary 570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# chunks). Entity references are passed by calling 580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# self.handle_entityref() with the entity reference as argument. 590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 600a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoclass SGMLParser(markupbase.ParserBase): 610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Definition of entities -- derived classes may override 620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao entity_or_charref = re.compile('&(?:' 630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' 640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao ')(;?)') 650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def __init__(self, verbose=0): 670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Initialize and reset this instance.""" 680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.verbose = verbose 690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.reset() 700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def reset(self): 720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Reset this instance. 
Loses all unprocessed data.""" 730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.__starttag_text = None 740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.rawdata = '' 750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.stack = [] 760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.lasttag = '???' 770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.nomoretags = 0 780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.literal = 0 790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao markupbase.ParserBase.reset(self) 800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def setnomoretags(self): 820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Enter literal mode (CDATA) till EOF. 830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao Intended for derived classes only. 850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """ 860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.nomoretags = self.literal = 1 870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def setliteral(self, *args): 890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Enter literal mode (CDATA). 900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao Intended for derived classes only. 920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """ 930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.literal = 1 940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def feed(self, data): 960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Feed some data to the parser. 970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao Call this as often as you want, with as little or as much text 990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao as you want (may include '\n'). 
(This just saves the text, 1000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao all the processing is done by goahead().) 1010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """ 1020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 1030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.rawdata = self.rawdata + data 1040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.goahead(0) 1050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 1060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def close(self): 1070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Handle the remaining data.""" 1080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.goahead(1) 1090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 1100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def error(self, message): 1110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao raise SGMLParseError(message) 1120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 1130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Internal -- handle data as far as reasonable. May leave state 1140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # and data to be processed by a subsequent call. If 'end' is 1150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # true, force handling all data as if followed by EOF marker. 
1160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def goahead(self, end): 1170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao rawdata = self.rawdata 1180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = 0 1190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao n = len(rawdata) 1200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao while i < n: 1210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if self.nomoretags: 1220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(rawdata[i:n]) 1230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = n 1240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao break 1250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = interesting.search(rawdata, i) 1260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if match: j = match.start() 1270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: j = n 1280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if i < j: 1290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(rawdata[i:j]) 1300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = j 1310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if i == n: break 1320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata[i] == '<': 1330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if starttagopen.match(rawdata, i): 1340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if self.literal: 1350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(rawdata[i]) 1360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = i+1 1370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao k = self.parse_starttag(i) 1390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if k < 0: break 1400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = k 1410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata.startswith("</", i): 1430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao k = self.parse_endtag(i) 1440a8c90248264a8b26970b4473770bcc3df8515fJosh 
Gao if k < 0: break 1450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = k 1460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.literal = 0 1470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if self.literal: 1490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if n > (i + 1): 1500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data("<") 1510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = i+1 1520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 1530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # incomplete 1540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao break 1550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata.startswith("<!--", i): 1570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Strictly speaking, a comment is --.*-- 1580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # within a declaration tag <!...>. 1590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # This should be removed, 1600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # and comments handled only in parse_declaration. 
1610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao k = self.parse_comment(i) 1620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if k < 0: break 1630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = k 1640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata.startswith("<?", i): 1660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao k = self.parse_pi(i) 1670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if k < 0: break 1680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = i+k 1690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata.startswith("<!", i): 1710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # This is some sort of declaration; in "HTML as 1720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # deployed," this should only be the document type 1730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # declaration ("<!DOCTYPE html...>"). 1740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao k = self.parse_declaration(i) 1750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if k < 0: break 1760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = k 1770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif rawdata[i] == '&': 1790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if self.literal: 1800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(rawdata[i]) 1810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = i+1 1820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = charref.match(rawdata, i) 1840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if match: 1850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao name = match.group(1) 1860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_charref(name) 1870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = match.end(0) 1880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if 
rawdata[i-1] != ';': i = i-1 1890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = entityref.match(rawdata, i) 1910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if match: 1920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao name = match.group(1) 1930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_entityref(name) 1940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = match.end(0) 1950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata[i-1] != ';': i = i-1 1960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 1970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 1980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.error('neither < nor & ??') 1990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # We get here only if incomplete matches but 2000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # nothing else 2010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = incomplete.match(rawdata, i) 2020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not match: 2030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(rawdata[i]) 2040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = i+1 2050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao continue 2060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao j = match.end(0) 2070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if j == n: 2080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao break # Really incomplete 2090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(rawdata[i:j]) 2100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = j 2110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # end while 2120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if end and i < n: 2130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(rawdata[i:n]) 2140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = n 2150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.rawdata = rawdata[i:] 2160a8c90248264a8b26970b4473770bcc3df8515fJosh 
Gao # XXX if end: check for empty stack 2170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Extensions for the DOCTYPE scanner: 2190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _decl_otherchars = '=' 2200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Internal -- parse processing instr, return length or -1 if not terminated 2220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def parse_pi(self, i): 2230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao rawdata = self.rawdata 2240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata[i:i+2] != '<?': 2250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.error('unexpected call to parse_pi()') 2260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = piclose.search(rawdata, i+2) 2270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not match: 2280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return -1 2290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao j = match.start(0) 2300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_pi(rawdata[i+2: j]) 2310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao j = match.end(0) 2320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return j-i 2330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def get_starttag_text(self): 2350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return self.__starttag_text 2360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Internal -- handle starttag, return length or -1 if not terminated 2380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def parse_starttag(self, i): 2390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.__starttag_text = None 2400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao start_pos = i 2410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao rawdata = self.rawdata 2420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if 
shorttagopen.match(rawdata, i): 2430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # SGML shorthand: <tag/data/ == <tag>data</tag> 2440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # XXX Can data contain &... (entity or char refs)? 2450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # XXX Can data contain < or > (tag characters)? 2460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # XXX Can there be whitespace before the first /? 2470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = shorttag.match(rawdata, i) 2480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not match: 2490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return -1 2500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao tag, data = match.group(1, 2) 2510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.__starttag_text = '<%s/' % tag 2520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao tag = tag.lower() 2530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao k = match.end(0) 2540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.finish_shorttag(tag, data) 2550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.__starttag_text = rawdata[start_pos:match.end(1) + 1] 2560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return k 2570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # XXX The following should skip matching quotes (' or ") 2580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # As a shortcut way to exit, this isn't so bad, but shouldn't 2590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # be used to locate the actual end of the start tag since the 2600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # < or > characters may be embedded in an attribute value. 
2610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = endbracket.search(rawdata, i+1) 2620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not match: 2630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return -1 2640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao j = match.start(0) 2650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Now parse the data between i+1 and j into a tag and attrs 2660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao attrs = [] 2670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata[i:i+2] == '<>': 2680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # SGML shorthand: <> == <last open tag seen> 2690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao k = j 2700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao tag = self.lasttag 2710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 2720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = tagfind.match(rawdata, i+1) 2730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not match: 2740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.error('unexpected call to parse_starttag') 2750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao k = match.end(0) 2760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao tag = rawdata[i+1:k].lower() 2770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.lasttag = tag 2780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao while k < j: 2790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = attrfind.match(rawdata, k) 2800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not match: break 2810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao attrname, rest, attrvalue = match.group(1, 2, 3) 2820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not rest: 2830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao attrvalue = attrname 2840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 2850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if (attrvalue[:1] == "'" == attrvalue[-1:] or 2860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao attrvalue[:1] == '"' == attrvalue[-1:]): 
2870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # strip quotes 2880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao attrvalue = attrvalue[1:-1] 2890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao attrvalue = self.entity_or_charref.sub( 2900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self._convert_ref, attrvalue) 2910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao attrs.append((attrname.lower(), attrvalue)) 2920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao k = match.end(0) 2930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata[j] == '>': 2940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao j = j+1 2950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.__starttag_text = rawdata[start_pos:j] 2960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.finish_starttag(tag, attrs) 2970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return j 2980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Internal -- convert entity or character reference 3000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def _convert_ref(self, match): 3010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if match.group(2): 3020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return self.convert_charref(match.group(2)) or \ 3030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao '&#%s%s' % match.groups()[1:] 3040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif match.group(3): 3050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return self.convert_entityref(match.group(1)) or \ 3060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao '&%s;' % match.group(1) 3070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 3080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return '&%s' % match.group(1) 3090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Internal -- parse endtag 3110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def parse_endtag(self, i): 3120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao rawdata = 
self.rawdata 3130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao match = endbracket.search(rawdata, i+1) 3140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not match: 3150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return -1 3160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao j = match.start(0) 3170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao tag = rawdata[i+2:j].strip().lower() 3180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if rawdata[j] == '>': 3190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao j = j+1 3200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.finish_endtag(tag) 3210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return j 3220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>) 3240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def finish_shorttag(self, tag, data): 3250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.finish_starttag(tag, []) 3260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(data) 3270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.finish_endtag(tag) 3280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Internal -- finish processing of start tag 3300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag 3310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def finish_starttag(self, tag, attrs): 3320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao try: 3330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao method = getattr(self, 'start_' + tag) 3340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao except AttributeError: 3350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao try: 3360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao method = getattr(self, 'do_' + tag) 3370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao except AttributeError: 3380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 
self.unknown_starttag(tag, attrs) 3390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return -1 3400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 3410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_starttag(tag, method, attrs) 3420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return 0 3430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 3440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.stack.append(tag) 3450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_starttag(tag, method, attrs) 3460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return 1 3470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Internal -- finish processing of end tag 3490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def finish_endtag(self, tag): 3500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not tag: 3510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao found = len(self.stack) - 1 3520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if found < 0: 3530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.unknown_endtag(tag) 3540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return 3550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 3560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if tag not in self.stack: 3570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao try: 3580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao method = getattr(self, 'end_' + tag) 3590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao except AttributeError: 3600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.unknown_endtag(tag) 3610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 3620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.report_unbalanced(tag) 3630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return 3640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao found = len(self.stack) 3650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for i in range(found): 3660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if self.stack[i] == 
tag: found = i 3670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao while len(self.stack) > found: 3680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao tag = self.stack[-1] 3690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao try: 3700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao method = getattr(self, 'end_' + tag) 3710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao except AttributeError: 3720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao method = None 3730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if method: 3740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_endtag(tag, method) 3750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 3760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.unknown_endtag(tag) 3770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao del self.stack[-1] 3780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Overridable -- handle start tag 3800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_starttag(self, tag, method, attrs): 3810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao method(attrs) 3820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Overridable -- handle end tag 3840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_endtag(self, tag, method): 3850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao method() 3860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Example -- report an unbalanced </...> tag. 
3880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def report_unbalanced(self, tag): 3890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if self.verbose: 3900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print '*** Unbalanced </' + tag + '>' 3910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print '*** Stack:', self.stack 3920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def convert_charref(self, name): 3940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Convert character reference, may be overridden.""" 3950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao try: 3960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao n = int(name) 3970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao except ValueError: 3980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return 3990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not 0 <= n <= 127: 4000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return 4010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return self.convert_codepoint(n) 4020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def convert_codepoint(self, codepoint): 4040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return chr(codepoint) 4050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_charref(self, name): 4070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Handle character reference, no need to override.""" 4080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao replacement = self.convert_charref(name) 4090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if replacement is None: 4100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.unknown_charref(name) 4110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(replacement) 4130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Definition of entities -- 
derived classes may override 4150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao entitydefs = \ 4160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} 4170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def convert_entityref(self, name): 4190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Convert entity references. 4200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao As an alternative to overriding this method; one can tailor the 4220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao results by setting up the self.entitydefs mapping appropriately. 4230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """ 4240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao table = self.entitydefs 4250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if name in table: 4260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return table[name] 4270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return 4290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_entityref(self, name): 4310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao """Handle entity references, no need to override.""" 4320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao replacement = self.convert_entityref(name) 4330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if replacement is None: 4340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.unknown_entityref(name) 4350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.handle_data(replacement) 4370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Example -- handle data, should be overridden 4390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_data(self, data): 4400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 
pass 4410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Example -- handle comment, could be overridden 4430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_comment(self, data): 4440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao pass 4450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Example -- handle declaration, could be overridden 4470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_decl(self, decl): 4480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao pass 4490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Example -- handle processing instruction, could be overridden 4510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_pi(self, data): 4520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao pass 4530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # To be overridden -- handlers for unknown objects 4550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def unknown_starttag(self, tag, attrs): pass 4560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def unknown_endtag(self, tag): pass 4570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def unknown_charref(self, ref): pass 4580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def unknown_entityref(self, ref): pass 4590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4610a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoclass TestSGMLParser(SGMLParser): 4620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def __init__(self, verbose=0): 4640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.testdata = "" 4650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao SGMLParser.__init__(self, verbose) 4660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_data(self, 
data): 4680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.testdata = self.testdata + data 4690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if len(repr(self.testdata)) >= 70: 4700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.flush() 4710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def flush(self): 4730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao data = self.testdata 4740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if data: 4750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.testdata = "" 4760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print 'data:', repr(data) 4770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def handle_comment(self, data): 4790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.flush() 4800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao r = repr(data) 4810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if len(r) > 68: 4820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao r = r[:32] + '...' 
+ r[-32:] 4830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print 'comment:', r 4840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def unknown_starttag(self, tag, attrs): 4860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.flush() 4870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not attrs: 4880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print 'start tag: <' + tag + '>' 4890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print 'start tag: <' + tag, 4910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for name, value in attrs: 4920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print name + '=' + '"' + value + '"', 4930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print '>' 4940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def unknown_endtag(self, tag): 4960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.flush() 4970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print 'end tag: </' + tag + '>' 4980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def unknown_entityref(self, ref): 5000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.flush() 5010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print '*** unknown entity ref: &' + ref + ';' 5020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def unknown_charref(self, ref): 5040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.flush() 5050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print '*** unknown char ref: &#' + ref + ';' 5060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def unknown_decl(self, data): 5080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.flush() 5090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print '*** unknown decl: [' + data + ']' 
5100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def close(self): 5120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao SGMLParser.close(self) 5130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao self.flush() 5140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5160a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef test(args = None): 5170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao import sys 5180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if args is None: 5200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao args = sys.argv[1:] 5210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if args and args[0] == '-s': 5230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao args = args[1:] 5240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao klass = SGMLParser 5250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 5260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao klass = TestSGMLParser 5270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if args: 5290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao file = args[0] 5300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 5310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao file = 'test.html' 5320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if file == '-': 5340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao f = sys.stdin 5350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 5360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao try: 5370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao f = open(file, 'r') 5380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao except IOError, msg: 5390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao print file, ":", msg 5400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao sys.exit(1) 
5410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao data = f.read() 5430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if f is not sys.stdin: 5440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao f.close() 5450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao x = klass() 5470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for c in data: 5480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao x.feed(c) 5490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao x.close() 5500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5520a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoif __name__ == '__main__': 5530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao test() 554