# test_htmlparser.py revision 75d9a62fe031195742b6d9f0647458af0fbd96f0
"""Tests for HTMLParser.py."""

import HTMLParser
import pprint
import sys
import unittest
from test import test_support


class EventCollector(HTMLParser.HTMLParser):
    """Parser subclass that records each handler call as an event tuple.

    Every handler appends a tuple whose first item names the kind of
    markup ("starttag", "data", "comment", ...) followed by the handler's
    arguments.  The recorded events are retrieved with get_events().
    """

    def __init__(self):
        # The event list and its bound append are set up before the base
        # class initializer runs, exactly as in the original ordering.
        self.events = []
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        The merged list also replaces self.events.
        """
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        merged = []
        prev_kind = None
        for event in self.events:
            kind = event[0]
            if kind == prev_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prev_kind = kind
        self.events = merged
        return merged

    # structure markup

    def handle_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))

    def handle_startendtag(self, tag, attrs):
        self.append(("startendtag", tag, attrs))

    def handle_endtag(self, tag):
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        self.append(("comment", data))

    def handle_charref(self, data):
        self.append(("charref", data))

    def handle_data(self, data):
        self.append(("data", data))

    def handle_decl(self, data):
        self.append(("decl", data))

    def handle_entityref(self, data):
        self.append(("entityref", data))

    def handle_pi(self, data):
        self.append(("pi", data))

    def unknown_decl(self, decl):
        self.append(("unknown decl", decl))


class EventCollectorExtra(EventCollector):
    """EventCollector that also records the raw text of each start tag."""

    def handle_starttag(self, tag, attrs):
        EventCollector.handle_starttag(self, tag, attrs)
        self.append(("starttag_text", self.get_starttag_text()))


class TestCaseBase(unittest.TestCase):
    """Shared helpers for running a parser over input and checking events."""

    def _run_check(self, source, expected_events, collector=EventCollector):
        """Feed *source* to a fresh *collector* and compare its events.

        *source* is iterated and each item is fed separately, so a plain
        string is fed one character at a time and a list of strings is
        fed chunk by chunk.  The test fails with a pretty-printed diff of
        expected and received events when they differ.
        """
        parser = collector()
        for s in source:
            parser.feed(s)
        parser.close()
        events = parser.get_events()
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def _run_check_extra(self, source, events):
        """Like _run_check(), but collect with EventCollectorExtra."""
        self._run_check(source, events, EventCollectorExtra)

    def _parse_error(self, source):
        """Assert that parsing *source* raises HTMLParser.HTMLParseError."""
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse)


class HTMLParserTestCase(TestCaseBase):

    def test_processing_instruction_only(self):
        self._run_check("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])
        self._run_check("<?processing instruction ?>", [
            ("pi", "processing instruction ?"),
            ])

    def test_simple_html(self):
        # The character references (&#32; and &#x201C;) must appear in
        # the input in their escaped form; the expected events below
        # report them as "charref" events.
        self._run_check("""
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar><<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
&#x201C;
<!--comment2a-- --comment2b-->
</Html>
""", [
    ("data", "\n"),
    ("decl", "DOCTYPE html PUBLIC 'foo'"),
    ("data", "\n"),
    ("starttag", "html", []),
    ("entityref", "entity"),
    ("charref", "32"),
    ("data", "\n"),
    ("comment", "comment1a\n-></foo><bar><<?pi?></foo<bar\ncomment1b"),
    ("data", "\n"),
    ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
    ("data", "sample\ntext\n"),
    ("charref", "x201C"),
    ("data", "\n"),
    ("comment", "comment2a-- --comment2b"),
    ("data", "\n"),
    ("endtag", "html"),
    ("data", "\n"),
    ])

    def test_unclosed_entityref(self):
        self._run_check("&entityref foo", [
            ("entityref", "entityref"),
            ("data", " foo"),
            ])

    def test_doctype_decl(self):
        # The same text is used as input and as the expected "decl"
        # payload, so the declaration must come back verbatim.
        inside = """\
DOCTYPE html [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
    version CDATA #IMPLIED
    profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self._run_check("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        self._run_check("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self._run_check("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self._run_check("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
          ]
        self._run_check("""<a b='v' c="v" d=v e>""", output)
        self._run_check("""<a b = 'v' c = "v" d = v e>""", output)
        self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", [("b", "xxx\n\txxx"),
                                            ("c", "yyy\t\nyyy"),
                                            ("d", "\txyz\n")])
                         ])
        self._run_check("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # Regression test for SF patch #669683.
        self._run_check("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])
        # Regression test for SF bug #921657.
        self._run_check("<a href=mailto:xyz@example.com>", [
            ("starttag", "a", [("href", "mailto:xyz@example.com")]),
            ])

    def test_attr_entity_replacement(self):
        # The input carries the entities in escaped form; the expected
        # attribute value is their replacement text.
        self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
            ("starttag", "a", [("b", "&><\"'")]),
            ])

    def test_attr_funky_names(self):
        self._run_check("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_illegal_declarations(self):
        self._parse_error('<!spacer type="block" height="25">')

    def test_starttag_end_boundary(self):
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def test_buffer_artefacts(self):
        # Each list is fed chunk by chunk; the split point must not
        # change the resulting events.
        output = [("starttag", "a", [("b", "<")])]
        self._run_check(["<a b='<'>"], output)
        self._run_check(["<a ", "b='<'>"], output)
        self._run_check(["<a b", "='<'>"], output)
        self._run_check(["<a b=", "'<'>"], output)
        self._run_check(["<a b='<", "'>"], output)
        self._run_check(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self._run_check(["<a b='>'>"], output)
        self._run_check(["<a ", "b='>'>"], output)
        self._run_check(["<a b", "='>'>"], output)
        self._run_check(["<a b=", "'>'>"], output)
        self._run_check(["<a b='>", "'>"], output)
        self._run_check(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self._run_check(["", "<!--abc-->"], output)
        self._run_check(["<", "!--abc-->"], output)
        self._run_check(["<!", "--abc-->"], output)
        self._run_check(["<!-", "-abc-->"], output)
        self._run_check(["<!--", "abc-->"], output)
        self._run_check(["<!--a", "bc-->"], output)
        self._run_check(["<!--ab", "c-->"], output)
        self._run_check(["<!--abc", "-->"], output)
        self._run_check(["<!--abc-", "->"], output)
        self._run_check(["<!--abc--", ">"], output)
        self._run_check(["<!--abc-->", ""], output)

    def test_starttag_junk_chars(self):
        self._parse_error("</>")
        self._parse_error("</$>")
        self._parse_error("</")
        self._parse_error("</a")
        self._parse_error("<a<a>")
        self._parse_error("</a<a>")
        self._parse_error("<!")
        self._parse_error("<a $>")
        self._parse_error("<a")
        self._parse_error("<a foo='bar'")
        self._parse_error("<a foo='bar")
        self._parse_error("<a foo='>'")
        self._parse_error("<a foo='>")
        self._parse_error("<a foo=>")

    def test_declaration_junk_chars(self):
        self._parse_error("<!DOCTYPE foo $ >")

    def test_startendtag(self):
        self._run_check("<p/>", [
            ("startendtag", "p", []),
            ])
        self._run_check("<p></p>", [
            ("starttag", "p", []),
            ("endtag", "p"),
            ])
        self._run_check("<p><img src='foo' /></p>", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
            ])

    def test_get_starttag_text(self):
        s = """<foo:bar \n one="1"\ttwo=2 >"""
        self._run_check_extra(s, [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s)])

    def test_cdata_content(self):
        # Inside <script>, comment-like and entity-like text is expected
        # to be reported as plain data.
        s = """<script> <!-- not a comment --> &not-an-entity-ref; </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "script"),
            ])
        s = """<script> <not a='start tag'> </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "script"),
            ])


def test_main():
    test_support.run_unittest(HTMLParserTestCase)


if __name__ == "__main__":
    test_main()