"""Regression tests for the deprecated ``sgmllib`` module (Python 2)."""

import pprint
import re
import unittest
from test import test_support
sgmllib = test_support.import_module('sgmllib', deprecated=True)


class EventCollector(sgmllib.SGMLParser):
    """SGML parser that records each handler call as a tuple.

    Every handler appends a tuple whose first item names the kind of
    event ("starttag", "endtag", "data", ...) to ``self.events``.
    """

    def __init__(self):
        self.events = []
        # Bound-method shortcut used by every handler below.
        self.append = self.events.append
        sgmllib.SGMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        Normalizing the list this way means buffer artefacts don't
        separate runs of contiguous characters.  The merged list also
        replaces ``self.events``.
        """
        merged = []
        prevtype = None
        for event in self.events:
            kind = event[0]
            if kind == prevtype == "data":
                # Fold this chunk into the preceding data event.
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prevtype = kind
        # NOTE: self.append stays bound to the previous list, so events
        # recorded after this call do not show up in the returned list.
        self.events = merged
        return merged

    # structure markup

    def unknown_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))

    def unknown_endtag(self, tag):
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        self.append(("comment", data))

    def handle_charref(self, data):
        self.append(("charref", data))

    def handle_data(self, data):
        self.append(("data", data))

    def handle_decl(self, decl):
        self.append(("decl", decl))

    def handle_entityref(self, data):
        self.append(("entityref", data))

    def handle_pi(self, data):
        self.append(("pi", data))

    def unknown_decl(self, decl):
        self.append(("unknown decl", decl))


class CDATAEventCollector(EventCollector):
    """Collector that switches the parser to literal mode inside <cdata>."""

    def start_cdata(self, attrs):
        self.append(("starttag", "cdata", attrs))
        self.setliteral()


class HTMLEntityCollector(EventCollector):
    """Collector that also records calls to the convert_* helpers."""

    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
        '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')

    def convert_charref(self, name):
        self.append(("charref", "convert", name))
        # Names starting with "x" are not passed on to the default
        # implementation; None is returned for them.
        if name[0] != "x":
            return EventCollector.convert_charref(self, name)

    def convert_codepoint(self, codepoint):
        self.append(("codepoint", "convert", codepoint))
        # The result is not returned.  NOTE(review): presumably on
        # purpose -- test_convert_overrides expects no "data" event for
        # the converted reference.
        EventCollector.convert_codepoint(self, codepoint)

    def convert_entityref(self, name):
        self.append(("entityref", "convert", name))
        return EventCollector.convert_entityref(self, name)

    # These record that they were called, then pass the call along
    # to the default implementation so that its actions can be
    # recorded.

    def handle_charref(self, data):
        self.append(("charref", data))
        sgmllib.SGMLParser.handle_charref(self, data)

    def handle_entityref(self, data):
        self.append(("entityref", data))
        sgmllib.SGMLParser.handle_entityref(self, data)


class SGMLParserTestCase(unittest.TestCase):
    """Event-level tests for sgmllib.SGMLParser."""

    # Parser class instantiated by get_events(); individual tests may
    # override it on the instance.
    collector = EventCollector

    def get_events(self, source):
        """Feed each item of *source* to a fresh parser; return its events.

        *source* is iterated, so a plain string is fed one character at
        a time while a list of strings is fed chunk by chunk.
        """
        parser = self.collector()
        for s in source:
            parser.feed(s)
        parser.close()
        return parser.get_events()

    def check_events(self, source, expected_events):
        """Fail unless parsing *source* yields exactly *expected_events*."""
        events = self.get_events(source)
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def check_parse_error(self, source):
        """Fail unless parsing *source* raises sgmllib.SGMLParseError."""
        parser = EventCollector()
        try:
            parser.feed(source)
            parser.close()
        except sgmllib.SGMLParseError:
            pass
        else:
            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                      % (source, pprint.pformat(parser.get_events())))

    def test_doctype_decl_internal(self):
        inside = """\
DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
             SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self.check_events(["<!%s>" % inside], [
            ("decl", inside),
            ])

    def test_doctype_decl_external(self):
        inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
        self.check_events("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_underscore_in_attrname(self):
        """Make sure attribute names with underscores are accepted"""
        # SF bug #436621
        self.check_events("<a has_under _under>", [
            ("starttag", "a", [("has_under", "has_under"),
                               ("_under", "_under")]),
            ])

    def test_underscore_in_tagname(self):
        """Make sure tag names with underscores are accepted"""
        # SF bug #436621
        self.check_events("<has_under></has_under>", [
            ("starttag", "has_under", []),
            ("endtag", "has_under"),
            ])

    def test_quotes_in_unquoted_attrs(self):
        """Be sure quotes in unquoted attributes are made part of the value"""
        # SF bug #436621
        self.check_events("<a href=foo'bar\"baz>", [
            ("starttag", "a", [("href", "foo'bar\"baz")]),
            ])

    def test_xhtml_empty_tag(self):
        """Handling of XHTML-style empty start tags"""
        self.check_events("<br />text<i></i>", [
            ("starttag", "br", []),
            ("data", "text"),
            ("starttag", "i", []),
            ("endtag", "i"),
            ])

    def test_processing_instruction_only(self):
        self.check_events("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def test_bad_nesting(self):
        self.check_events("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self.check_events("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self.check_events("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
          ]
        self.check_events("""<a b='v' c="v" d=v e>""", output)
        self.check_events("""<a b = 'v' c = "v" d = v e>""", output)
        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", [("b", "xxx\n\txxx"),
                                            ("c", "yyy\t\nyyy"),
                                            ("d", "\txyz\n")])
                         ])
        self.check_events("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # URL construction stuff from RFC 1808:
        safe = "$-_.+"
        extra = "!*'(),"
        reserved = ";/?:@&="
        url = "http://example.com:8080/path/to/file?%s%s%s" % (
            safe, extra, reserved)
        self.check_events("""<e a=%s>""" % url, [
            ("starttag", "e", [("a", url)]),
            ])
        # Regression test for SF patch #669683.
        self.check_events("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])

    def test_attr_values_entities(self):
        """Substitution of entities and charrefs in attribute values"""
        # SF bug #1452246
        # NOTE(review): the references in these literals are written out
        # in escaped form; the expected values for "d", "h", "j" and "k"
        # should be confirmed against the parser's actual behaviour.
        self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
                                f="&xxx;" g='&#32;&#33;' h='&#500;'
                                i='x?a=b&c=d;'
                                j='&amp;#42;' k='&#38;#42;'>""",
            [("starttag", "a", [("b", "<"),
                                ("c", "<>"),
                                ("d", "&lt->"),
                                ("e", "< "),
                                ("f", "&xxx;"),
                                ("g", " !"),
                                ("h", "&#500;"),
                                ("i", "x?a=b&c=d;"),
                                ("j", "&#42;"),
                                ("k", "&#42;"),
                                ])])

    def test_convert_overrides(self):
        # This checks that the character and entity reference
        # conversion helpers are called at the documented times.  No
        # attempt is made to really change what the parser accepts.
        #
        self.collector = HTMLEntityCollector
        self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
                           '&foobar;&#42;'), [
            ('entityref', 'convert', 'ldquo'),
            ('charref', 'convert', 'x201d'),
            ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
            ('data', 'foo'),
            ('endtag', 'a'),
            ('entityref', 'foobar'),
            ('entityref', 'convert', 'foobar'),
            ('charref', '42'),
            ('charref', 'convert', '42'),
            ('codepoint', 'convert', 42),
            ])

    def test_attr_funky_names(self):
        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_attr_value_ip6_url(self):
        # http://www.python.org/sf/853506
        self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
                           "<a href=http://[1080::8:800:200C:417A]/>"), [
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ])

    def test_weird_starttags(self):
        self.check_events("<a<a>", [
            ("starttag", "a", []),
            ("starttag", "a", []),
            ])
        self.check_events("</a<a>", [
            ("endtag", "a"),
            ("starttag", "a", []),
            ])

    def test_declaration_junk_chars(self):
        self.check_parse_error("<!DOCTYPE foo $ >")

    def test_get_starttag_text(self):
        s = """<foobar \n one="1"\ttwo=2 >"""
        self.check_events(s, [
            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
            ])

    def test_cdata_content(self):
        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
             "<notcdata> <!-- comment --> </notcdata>")
        self.collector = CDATAEventCollector
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "cdata"),
            ("starttag", "notcdata", []),
            ("data", " "),
            ("comment", " comment "),
            ("data", " "),
            ("endtag", "notcdata"),
            ])
        s = """<cdata> <not a='start tag'> </cdata>"""
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "cdata"),
            ])

    def test_illegal_declarations(self):
        s = 'abc<!spacer type="block" height="25">def'
        self.check_events(s, [
            ("data", "abc"),
            ("unknown decl", 'spacer type="block" height="25"'),
            ("data", "def"),
            ])

    def test_enumerated_attr_type(self):
        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
        self.check_events(s, [
            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
            ])

    def test_read_chunks(self):
        # SF bug #1541697, this caused sgml parser to hang
        # Just verify this code doesn't cause a hang.
        CHUNK = 1024  # increasing this to 8212 makes the problem go away

        fp = sgmllib.SGMLParser()
        # Close the input file even if feed() raises.
        with open(test_support.findfile('sgml_input.html')) as f:
            while True:
                data = f.read(CHUNK)
                fp.feed(data)
                # A short (or empty) read means end of file.
                if len(data) != CHUNK:
                    break

    def test_only_decode_ascii(self):
        # SF bug #1651995, make sure non-ascii character references are not decoded
        s = '<signs exclamation="&#33;" copyright="&#169;" quoteleft="&#8216;">'
        self.check_events(s, [
            ('starttag', 'signs',
             [('exclamation', '!'), ('copyright', '&#169;'),
              ('quoteleft', '&#8216;')]),
            ])

    # XXX These tests have been disabled by prefixing their names with
    # an underscore.  The first two exercise outstanding bugs in the
    # sgmllib module, and the third exhibits questionable behavior
    # that needs to be carefully considered before changing it.

    def _test_starttag_end_boundary(self):
        self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
        self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])

    def _test_buffer_artefacts(self):
        output = [("starttag", "a", [("b", "<")])]
        self.check_events(["<a b='<'>"], output)
        self.check_events(["<a ", "b='<'>"], output)
        self.check_events(["<a b", "='<'>"], output)
        self.check_events(["<a b=", "'<'>"], output)
        self.check_events(["<a b='<", "'>"], output)
        self.check_events(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self.check_events(["<a b='>'>"], output)
        self.check_events(["<a ", "b='>'>"], output)
        self.check_events(["<a b", "='>'>"], output)
        self.check_events(["<a b=", "'>'>"], output)
        self.check_events(["<a b='>", "'>"], output)
        self.check_events(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self.check_events(["", "<!--abc-->"], output)
        self.check_events(["<", "!--abc-->"], output)
        self.check_events(["<!", "--abc-->"], output)
        self.check_events(["<!-", "-abc-->"], output)
        self.check_events(["<!--", "abc-->"], output)
        self.check_events(["<!--a", "bc-->"], output)
        self.check_events(["<!--ab", "c-->"], output)
        self.check_events(["<!--abc", "-->"], output)
        self.check_events(["<!--abc-", "->"], output)
        self.check_events(["<!--abc--", ">"], output)
        self.check_events(["<!--abc-->", ""], output)

    def _test_starttag_junk_chars(self):
        self.check_parse_error("<")
        self.check_parse_error("<>")
        self.check_parse_error("</$>")
        self.check_parse_error("</")
        self.check_parse_error("</a")
        self.check_parse_error("<$")
        self.check_parse_error("<$>")
        self.check_parse_error("<!")
        self.check_parse_error("<a $>")
        self.check_parse_error("<a")
        self.check_parse_error("<a foo='bar'")
        self.check_parse_error("<a foo='bar")
        self.check_parse_error("<a foo='>'")
        self.check_parse_error("<a foo='>")
        self.check_parse_error("<a foo=>")


def test_main():
    test_support.run_unittest(SGMLParserTestCase)


if __name__ == "__main__":
    test_main()