# test_htmlparser.py revision c5f05e45cffa16f45f1332cec531c045893f928f
1"""Tests for HTMLParser.py."""
2
3import HTMLParser
4import pprint
5import unittest
6from test import test_support
7
8
class EventCollector(HTMLParser.HTMLParser):
    """HTMLParser subclass that logs every handler callback as a tuple.

    Each callback appends ``(kind, arg, ...)`` to ``self.events`` in the
    order the parser reports it.  ``get_events()`` returns the log with
    adjacent "data" events merged.
    """

    def __init__(self):
        self.events = []
        # Cached bound method used by all handlers.  get_events() updates
        # the list in place, so this shortcut always targets the live list.
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with runs of "data" events merged.

        The merged result also replaces the contents of ``self.events``,
        so the method may be called repeatedly, and events recorded after
        a call are still collected.
        """
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        merged = []
        prev_kind = None
        for event in self.events:
            kind = event[0]
            if kind == prev_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prev_kind = kind
        # Replace the contents instead of rebinding self.events: rebinding
        # would leave self.append bound to the discarded list and silently
        # drop every event recorded after this call.
        self.events[:] = merged
        return self.events

    # structure markup

    def handle_starttag(self, tag, attrs):
        """Record a start tag with its attribute list."""
        self.append(("starttag", tag, attrs))

    def handle_startendtag(self, tag, attrs):
        """Record an empty-element tag with its attribute list."""
        self.append(("startendtag", tag, attrs))

    def handle_endtag(self, tag):
        """Record an end tag."""
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        """Record the text of a comment."""
        self.append(("comment", data))

    def handle_charref(self, data):
        """Record a character reference."""
        self.append(("charref", data))

    def handle_data(self, data):
        """Record a chunk of character data."""
        self.append(("data", data))

    def handle_decl(self, data):
        """Record a declaration."""
        self.append(("decl", data))

    def handle_entityref(self, data):
        """Record an entity reference."""
        self.append(("entityref", data))

    def handle_pi(self, data):
        """Record a processing instruction."""
        self.append(("pi", data))

    def unknown_decl(self, decl):
        """Record a declaration the parser does not recognize."""
        self.append(("unknown decl", decl))
64
65
class EventCollectorExtra(EventCollector):
    """EventCollector that also logs the raw source text of each start tag."""

    def handle_starttag(self, tag, attrs):
        """Record the start tag, then a "starttag_text" event for it."""
        EventCollector.handle_starttag(self, tag, attrs)
        raw_text = self.get_starttag_text()
        self.append(("starttag_text", raw_text))
71
72
class TestCaseBase(unittest.TestCase):
    """Shared helpers that feed markup to a parser and check the outcome."""

    def _run_check(self, source, expected_events, collector=EventCollector):
        """Feed *source* to a fresh collector and compare its event log.

        *source* is iterated and every item is passed to ``feed()`` on its
        own, so a plain string is fed one character at a time and a list of
        strings is fed chunk by chunk.  When the collected events differ
        from *expected_events* the test fails with both lists
        pretty-printed.
        """
        parser = collector()
        for chunk in source:
            parser.feed(chunk)
        parser.close()
        received = parser.get_events()
        if received == expected_events:
            return
        report = ("received events did not match expected events\n"
                  "Expected:\n%s"
                  "\nReceived:\n%s"
                  % (pprint.pformat(expected_events),
                     pprint.pformat(received)))
        self.fail(report)

    def _run_check_extra(self, source, events):
        """Like _run_check(), but collect with EventCollectorExtra."""
        self._run_check(source, events, collector=EventCollectorExtra)

    def _parse_error(self, source):
        """Assert that parsing *source* raises HTMLParseError."""
        def feed_and_close():
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, feed_and_close)
95
96
class HTMLParserTestCase(TestCaseBase):
    """Event-level tests of HTMLParser on valid and malformed markup."""

    def test_processing_instruction_only(self):
        # The contents of a processing instruction are reported verbatim,
        # including a trailing "?" when present.
        cases = (
            ("<?processing instruction>", "processing instruction"),
            ("<?processing instruction ?>", "processing instruction ?"),
        )
        for markup, contents in cases:
            self._run_check(markup, [("pi", contents)])

    def test_simple_html(self):
        # One small document mixing declarations, tags, references,
        # comments and character data.
        markup = """
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar>&lt;<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
&#x201C;
<!--comment2a-- --comment2b--><!>
</Html>
"""
        expected = [
            ("data", "\n"),
            ("decl", "DOCTYPE html PUBLIC 'foo'"),
            ("data", "\n"),
            ("starttag", "html", []),
            ("entityref", "entity"),
            ("charref", "32"),
            ("data", "\n"),
            ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
            ("data", "\n"),
            ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
            ("data", "sample\ntext\n"),
            ("charref", "x201C"),
            ("data", "\n"),
            ("comment", "comment2a-- --comment2b"),
            ("data", "\n"),
            ("endtag", "html"),
            ("data", "\n"),
        ]
        self._run_check(markup, expected)

    def test_unclosed_entityref(self):
        # An entity reference without the closing ";" is still reported.
        expected = [
            ("entityref", "entityref"),
            ("data", " foo"),
        ]
        self._run_check("&entityref foo", expected)

    def test_doctype_decl(self):
        # A DOCTYPE with an internal subset is reported as a single
        # declaration whose text is everything between "<!" and ">".
        declaration = """\
DOCTYPE html [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        markup = "<!" + declaration + ">"
        self._run_check(markup, [("decl", declaration)])

    def test_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        expected = [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
        ]
        self._run_check("<a><b></a></b>", expected)

    def test_bare_ampersands(self):
        # Ampersands that do not start a reference are plain data.
        text = "this text & contains & ampersands &"
        self._run_check(text, [("data", text)])

    def test_bare_pointy_brackets(self):
        # Angle brackets that do not form a tag are plain data.
        text = "this < text > contains < bare>pointy< brackets"
        self._run_check(text, [("data", text)])

    def test_attr_syntax(self):
        # The same attributes written with different quoting and
        # whitespace must all produce the same start tag event.
        expected = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
          ]
        variants = (
            """<a b='v' c="v" d=v e>""",
            """<a  b = 'v' c = "v" d = v e>""",
            """<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""",
            """<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""",
        )
        for markup in variants:
            self._run_check(markup, expected)

    def test_attr_values(self):
        # Whitespace inside quoted values is kept as written.
        whitespace_attrs = [
            ("b", "xxx\n\txxx"),
            ("c", "yyy\t\nyyy"),
            ("d", "\txyz\n"),
        ]
        self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", whitespace_attrs)])
        # Empty quoted values are reported as empty strings.
        self._run_check("""<a b='' c="">""",
                        [("starttag", "a", [("b", ""), ("c", "")])])
        # Regression test for SF patch #669683.
        self._run_check("<e a=rgb(1,2,3)>",
                        [("starttag", "e", [("a", "rgb(1,2,3)")])])
        # Regression test for SF bug #921657.
        self._run_check(
            "<a href=mailto:xyz@example.com>",
            [("starttag", "a", [("href", "mailto:xyz@example.com")])])

    def test_attr_entity_replacement(self):
        # Entity references inside an attribute value are replaced.
        markup = """<a b='&amp;&gt;&lt;&quot;&apos;'>"""
        self._run_check(markup, [("starttag", "a", [("b", "&><\"'")])])

    def test_attr_funky_names(self):
        # Attribute names may contain ".", ":" and "-".
        markup = """<a a.b='v' c:d=v e-f=v>"""
        attrs = [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]
        self._run_check(markup, [("starttag", "a", attrs)])

    def test_illegal_declarations(self):
        self._parse_error('<!spacer type="block" height="25">')

    def test_starttag_end_boundary(self):
        # A "<" or ">" inside a quoted value must not end the tag.
        for char in ("<", ">"):
            markup = "<a b='%s'>" % char
            self._run_check(markup, [("starttag", "a", [("b", char)])])

    def test_buffer_artefacts(self):
        # The way the input is split across feed() calls must not change
        # the events produced.
        expected = [("starttag", "a", [("b", "<")])]
        chunkings = (
            ["<a b='<'>"],
            ["<a ", "b='<'>"],
            ["<a b", "='<'>"],
            ["<a b=", "'<'>"],
            ["<a b='<", "'>"],
            ["<a b='<'", ">"],
        )
        for chunks in chunkings:
            self._run_check(chunks, expected)

        expected = [("starttag", "a", [("b", ">")])]
        chunkings = (
            ["<a b='>'>"],
            ["<a ", "b='>'>"],
            ["<a b", "='>'>"],
            ["<a b=", "'>'>"],
            ["<a b='>", "'>"],
            ["<a b='>'", ">"],
        )
        for chunks in chunkings:
            self._run_check(chunks, expected)

        # Split the comment at every possible position, including the
        # two degenerate splits with an empty first or second chunk.
        expected = [("comment", "abc")]
        comment = "<!--abc-->"
        for cut in range(len(comment) + 1):
            self._run_check([comment[:cut], comment[cut:]], expected)

    def test_starttag_junk_chars(self):
        # Each of these malformed or truncated tags must be rejected.
        malformed = (
            "</>",
            "</$>",
            "</",
            "</a",
            "<a<a>",
            "</a<a>",
            "<!",
            "<a $>",
            "<a",
            "<a foo='bar'",
            "<a foo='bar",
            "<a foo='>'",
            "<a foo='>",
            "<a foo=>",
        )
        for markup in malformed:
            self._parse_error(markup)

    def test_declaration_junk_chars(self):
        self._parse_error("<!DOCTYPE foo $ >")

    def test_startendtag(self):
        # "<p/>" is a single startendtag event; "<p></p>" is two events.
        self._run_check("<p/>", [("startendtag", "p", [])])
        self._run_check("<p></p>", [("starttag", "p", []), ("endtag", "p")])
        nested = [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
        ]
        self._run_check("<p><img src='foo' /></p>", nested)

    def test_get_starttag_text(self):
        # get_starttag_text() returns the tag exactly as written.
        markup = """<foo:bar   \n   one="1"\ttwo=2   >"""
        expected = [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", markup),
        ]
        self._run_check_extra(markup, expected)

    def test_cdata_content(self):
        # Inside <script>, comment-like, entity-like and tag-like text is
        # reported as plain data.
        contents = (
            " <!-- not a comment --> &not-an-entity-ref; ",
            " <not a='start tag'> ",
        )
        for content in contents:
            markup = "<script>%s</script>" % content
            self._run_check(markup, [
                ("starttag", "script", []),
                ("data", content),
                ("endtag", "script"),
            ])

    def test_entityrefs_in_attributes(self):
        # Known entity and character references in an attribute value are
        # replaced; an unknown entity is left as written.
        markup = "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>"
        attrs = [("foo", u"\u20AC&aa&unsupported;")]
        self._run_check(markup, [("starttag", "html", attrs)])
315
316
def test_main():
    """Run the HTMLParser test case via ``test_support.run_unittest``."""
    suites = (HTMLParserTestCase,)
    test_support.run_unittest(*suites)


if __name__ == "__main__":
    # Allow running this file directly as a script.
    test_main()
323