10a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoimport pprint
20a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoimport re
30a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoimport unittest
40a8c90248264a8b26970b4473770bcc3df8515fJosh Gaofrom test import test_support
50a8c90248264a8b26970b4473770bcc3df8515fJosh Gaosgmllib = test_support.import_module('sgmllib', deprecated=True)
60a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
70a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
class EventCollector(sgmllib.SGMLParser):
    """SGML parser that records each callback as a tuple in ``self.events``.

    Every handler appends a tuple whose first item names the kind of event
    ("starttag", "endtag", "data", ...) followed by the handler's arguments.
    """

    def __init__(self):
        # The event list and the ``append`` shortcut are created before the
        # base class initializer is invoked.
        self.events = []
        self.append = self.events.append
        sgmllib.SGMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        Normalizing the list keeps buffer artefacts from separating runs of
        contiguous characters.  The normalized list replaces ``self.events``
        and ``self.append`` is re-bound to it, so anything recorded after
        this call lands in the list that was returned.
        """
        merged = []
        previous_kind = None
        for event in self.events:
            kind = event[0]
            if kind == "data" and previous_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            previous_kind = kind
        self.events = merged
        # Keep the shortcut pointing at the live list; it would otherwise
        # still append to the list that was just replaced.
        self.append = merged.append
        return merged

    # structure markup

    def unknown_starttag(self, tag, attrs):
        """Record a start tag together with its attribute list."""
        self.append(("starttag", tag, attrs))

    def unknown_endtag(self, tag):
        """Record an end tag."""
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        """Record a comment."""
        self.append(("comment", data))

    def handle_charref(self, data):
        """Record a character reference without delegating to the base class."""
        self.append(("charref", data))

    def handle_data(self, data):
        """Record a run of character data."""
        self.append(("data", data))

    def handle_decl(self, decl):
        """Record a declaration."""
        self.append(("decl", decl))

    def handle_entityref(self, data):
        """Record an entity reference without delegating to the base class."""
        self.append(("entityref", data))

    def handle_pi(self, data):
        """Record a processing instruction."""
        self.append(("pi", data))

    def unknown_decl(self, decl):
        """Record a declaration reported through ``unknown_decl``."""
        self.append(("unknown decl", decl))
600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
class CDATAEventCollector(EventCollector):
    """Event collector with a dedicated handler for the <cdata> start tag."""

    def start_cdata(self, attrs):
        # Record the start tag first, then put the parser into literal mode.
        event = ("starttag", "cdata", attrs)
        self.append(event)
        self.setliteral()
660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
class HTMLEntityCollector(EventCollector):
    """Event collector that also logs calls to the reference-conversion hooks."""

    entity_or_charref = re.compile(
        '(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
        '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)'
    )

    def convert_charref(self, name):
        """Log the call; delegate to the inherited hook unless *name* starts with "x"."""
        self.append(("charref", "convert", name))
        if name[0] == "x":
            # "x"-prefixed references are not passed on; nothing is returned.
            return None
        return EventCollector.convert_charref(self, name)

    def convert_codepoint(self, codepoint):
        """Log the call and invoke the inherited hook.

        The inherited result is discarded, so this method returns None.
        """
        self.append(("codepoint", "convert", codepoint))
        EventCollector.convert_codepoint(self, codepoint)

    def convert_entityref(self, name):
        """Log the call and return whatever the inherited hook returns."""
        self.append(("entityref", "convert", name))
        return EventCollector.convert_entityref(self, name)

    # The two handlers below record that they were called and then pass
    # the call along to the default sgmllib implementation so that its
    # actions can be recorded as well.

    def handle_charref(self, data):
        event = ("charref", data)
        self.append(event)
        sgmllib.SGMLParser.handle_charref(self, data)

    def handle_entityref(self, data):
        event = ("entityref", data)
        self.append(event)
        sgmllib.SGMLParser.handle_entityref(self, data)
970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
990a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoclass SGMLParserTestCase(unittest.TestCase):
1000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    collector = EventCollector
1020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def get_events(self, source):
1040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        parser = self.collector()
1050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        try:
1060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            for s in source:
1070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                parser.feed(s)
1080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            parser.close()
1090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        except:
1100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            #self.events = parser.events
1110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            raise
1120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        return parser.get_events()
1130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def check_events(self, source, expected_events):
1150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        try:
1160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            events = self.get_events(source)
1170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        except:
1180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            #import sys
1190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            #print >>sys.stderr, pprint.pformat(self.events)
1200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            raise
1210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        if events != expected_events:
1220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.fail("received events did not match expected events\n"
1230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                      "Expected:\n" + pprint.pformat(expected_events) +
1240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                      "\nReceived:\n" + pprint.pformat(events))
1250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def check_parse_error(self, source):
1270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        parser = EventCollector()
1280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        try:
1290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            parser.feed(source)
1300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            parser.close()
1310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        except sgmllib.SGMLParseError:
1320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            pass
1330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        else:
1340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
1350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                      % (source, pprint.pformat(parser.get_events())))
1360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_doctype_decl_internal(self):
1380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        inside = """\
1390a8c90248264a8b26970b4473770bcc3df8515fJosh GaoDOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
1400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao             SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
1410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao  <!ELEMENT html - O EMPTY>
1420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao  <!ATTLIST html
1430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao      version CDATA #IMPLIED
1440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao      profile CDATA 'DublinCore'>
1450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
1460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao  <!ENTITY myEntity 'internal parsed entity'>
1470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
1480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao  <!ENTITY % paramEntity 'name|name|name'>
1490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao  %paramEntity;
1500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao  <!-- comment -->
1510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao]"""
1520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!%s>" % inside], [
1530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("decl", inside),
1540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
1550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_doctype_decl_external(self):
1570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
1580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<!%s>" % inside, [
1590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("decl", inside),
1600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
1610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_underscore_in_attrname(self):
1630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # SF bug #436621
1640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Make sure attribute names with underscores are accepted"""
1650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<a has_under _under>", [
1660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", [("has_under", "has_under"),
1670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                               ("_under", "_under")]),
1680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
1690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_underscore_in_tagname(self):
1710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # SF bug #436621
1720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Make sure tag names with underscores are accepted"""
1730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<has_under></has_under>", [
1740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "has_under", []),
1750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("endtag", "has_under"),
1760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
1770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_quotes_in_unquoted_attrs(self):
1790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # SF bug #436621
1800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Be sure quotes in unquoted attributes are made part of the value"""
1810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<a href=foo'bar\"baz>", [
1820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", [("href", "foo'bar\"baz")]),
1830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
1840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_xhtml_empty_tag(self):
1860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Handling of XHTML-style empty start tags"""
1870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<br />text<i></i>", [
1880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "br", []),
1890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("data", "text"),
1900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "i", []),
1910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("endtag", "i"),
1920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
1930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_processing_instruction_only(self):
1950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<?processing instruction>", [
1960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("pi", "processing instruction"),
1970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
1980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
1990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_bad_nesting(self):
2000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<a><b></a></b>", [
2010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", []),
2020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "b", []),
2030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("endtag", "a"),
2040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("endtag", "b"),
2050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
2060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_bare_ampersands(self):
2080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("this text & contains & ampersands &", [
2090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("data", "this text & contains & ampersands &"),
2100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
2110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_bare_pointy_brackets(self):
2130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("this < text > contains < bare>pointy< brackets", [
2140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("data", "this < text > contains < bare>pointy< brackets"),
2150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
2160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_attr_syntax(self):
2180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        output = [
2190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
2200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao          ]
2210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("""<a b='v' c="v" d=v e>""", output)
2220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("""<a  b = 'v' c = "v" d = v e>""", output)
2230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
2240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
2250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_attr_values(self):
2270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
2280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                        [("starttag", "a", [("b", "xxx\n\txxx"),
2290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                            ("c", "yyy\t\nyyy"),
2300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                            ("d", "\txyz\n")])
2310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                         ])
2320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("""<a b='' c="">""", [
2330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", [("b", ""), ("c", "")]),
2340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
2350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # URL construction stuff from RFC 1808:
2360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        safe = "$-_.+"
2370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        extra = "!*'(),"
2380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        reserved = ";/?:@&="
2390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        url = "http://example.com:8080/path/to/file?%s%s%s" % (
2400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            safe, extra, reserved)
2410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("""<e a=%s>""" % url, [
2420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "e", [("a", url)]),
2430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
2440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # Regression test for SF patch #669683.
2450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<e a=rgb(1,2,3)>", [
2460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "e", [("a", "rgb(1,2,3)")]),
2470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
2480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_attr_values_entities(self):
2500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        """Substitution of entities and charrefs in attribute values"""
2510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # SF bug #1452246
2520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
2530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                f="&xxx;" g='&#32;&#33;' h='&#500;'
2540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                i='x?a=b&c=d;'
2550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                j='&amp;#42;' k='&#38;#42;'>""",
2560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            [("starttag", "a", [("b", "<"),
2570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ("c", "<>"),
2580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ("d", "&lt->"),
2590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ("e", "< "),
2600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ("f", "&xxx;"),
2610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ("g", " !"),
2620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ("h", "&#500;"),
2630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ("i", "x?a=b&c=d;"),
2640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ("j", "&#42;"),
2650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ("k", "&#42;"),
2660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                                ])])
2670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_convert_overrides(self):
2690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # This checks that the character and entity reference
2700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # conversion helpers are called at the documented times.  No
2710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # attempt is made to really change what the parser accepts.
2720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        #
2730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.collector = HTMLEntityCollector
2740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
2750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                           '&foobar;&#42;'), [
2760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('entityref', 'convert', 'ldquo'),
2770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('charref', 'convert', 'x201d'),
2780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
2790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('data', 'foo'),
2800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('endtag', 'a'),
2810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('entityref', 'foobar'),
2820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('entityref', 'convert', 'foobar'),
2830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('charref', '42'),
2840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('charref', 'convert', '42'),
2850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('codepoint', 'convert', 42),
2860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
2870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_attr_funky_names(self):
2890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
2900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
2910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
2920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
2930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_attr_value_ip6_url(self):
2940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # http://www.python.org/sf/853506
2950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
2960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                           "<a href=http://[1080::8:800:200C:417A]/>"), [
2970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
2980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
2990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
3000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_weird_starttags(self):
3020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<a<a>", [
3030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", []),
3040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", []),
3050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
3060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("</a<a>", [
3070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("endtag", "a"),
3080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "a", []),
3090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
3100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_declaration_junk_chars(self):
3120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<!DOCTYPE foo $ >")
3130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_get_starttag_text(self):
3150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        s = """<foobar   \n   one="1"\ttwo=2   >"""
3160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(s, [
3170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
3180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
3190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_cdata_content(self):
3210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
3220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao             "<notcdata> <!-- comment --> </notcdata>")
3230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.collector = CDATAEventCollector
3240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(s, [
3250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "cdata", []),
3260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
3270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("endtag", "cdata"),
3280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "notcdata", []),
3290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("data", " "),
3300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("comment", " comment "),
3310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("data", " "),
3320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("endtag", "notcdata"),
3330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
3340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        s = """<cdata> <not a='start tag'> </cdata>"""
3350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(s, [
3360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("starttag", "cdata", []),
3370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("data", " <not a='start tag'> "),
3380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("endtag", "cdata"),
3390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
3400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_illegal_declarations(self):
3420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        s = 'abc<!spacer type="block" height="25">def'
3430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(s, [
3440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("data", "abc"),
3450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("unknown decl", 'spacer type="block" height="25"'),
3460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ("data", "def"),
3470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
3480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_enumerated_attr_type(self):
3500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
3510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(s, [
3520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
3530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
3540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_read_chunks(self):
3560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # SF bug #1541697, this caused sgml parser to hang
3570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # Just verify this code doesn't cause a hang.
3580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        CHUNK = 1024  # increasing this to 8212 makes the problem go away
3590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        f = open(test_support.findfile('sgml_input.html'))
3610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        fp = sgmllib.SGMLParser()
3620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        while 1:
3630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            data = f.read(CHUNK)
3640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            fp.feed(data)
3650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            if len(data) != CHUNK:
3660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao                break
3670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def test_only_decode_ascii(self):
3690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        # SF bug #1651995, make sure non-ascii character references are not decoded
3700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        s = '<signs exclamation="&#33" copyright="&#169" quoteleft="&#8216;">'
3710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(s, [
3720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ('starttag', 'signs',
3730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao             [('exclamation', '!'), ('copyright', '&#169'),
3740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao              ('quoteleft', '&#8216;')]),
3750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao            ])
3760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # XXX These tests have been disabled by prefixing their names with
3780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # an underscore.  The first two exercise outstanding bugs in the
3790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # sgmllib module, and the third exhibits questionable behavior
3800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    # that needs to be carefully considered before changing it.
3810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def _test_starttag_end_boundary(self):
3830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
3840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])
3850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def _test_buffer_artefacts(self):
3870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        output = [("starttag", "a", [("b", "<")])]
3880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b='<'>"], output)
3890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a ", "b='<'>"], output)
3900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b", "='<'>"], output)
3910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b=", "'<'>"], output)
3920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b='<", "'>"], output)
3930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b='<'", ">"], output)
3940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
3950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        output = [("starttag", "a", [("b", ">")])]
3960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b='>'>"], output)
3970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a ", "b='>'>"], output)
3980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b", "='>'>"], output)
3990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b=", "'>'>"], output)
4000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b='>", "'>"], output)
4010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<a b='>'", ">"], output)
4020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        output = [("comment", "abc")]
4040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["", "<!--abc-->"], output)
4050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<", "!--abc-->"], output)
4060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!", "--abc-->"], output)
4070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!-", "-abc-->"], output)
4080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!--", "abc-->"], output)
4090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!--a", "bc-->"], output)
4100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!--ab", "c-->"], output)
4110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!--abc", "-->"], output)
4120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!--abc-", "->"], output)
4130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!--abc--", ">"], output)
4140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_events(["<!--abc-->", ""], output)
4150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao    def _test_starttag_junk_chars(self):
4170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<")
4180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<>")
4190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("</$>")
4200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("</")
4210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("</a")
4220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<$")
4230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<$>")
4240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<!")
4250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<a $>")
4260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<a")
4270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<a foo='bar'")
4280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<a foo='bar")
4290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<a foo='>'")
4300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<a foo='>")
4310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao        self.check_parse_error("<a foo=>")
4320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
def test_main():
    """Run SGMLParserTestCase through test_support.run_unittest."""
    test_classes = (SGMLParserTestCase,)
    test_support.run_unittest(*test_classes)
4360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
4370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao
if __name__ == "__main__":
    # Executed as a script rather than imported: run the suite.
    test_main()
440