test_pyexpat.py revision b44155483515123a320ae57ec4b7ab237f991362
1# XXX TypeErrors on calling handlers, or on bad return values from a
2# handler, are obscure and unhelpful.
3
4from io import BytesIO
5import sys
6import unittest
7
8from xml.parsers import expat
9
10from test.support import sortdict, run_unittest
11
12
class SetAttributeTest(unittest.TestCase):
    """Check the values read back from the parser's flag attributes."""

    def setUp(self):
        self.parser = expat.ParserCreate(namespace_separator='!')
        # (value assigned, value expected when the attribute is read back).
        self.set_get_pairs = [
            (0, 0),
            (1, 1),
            (2, 1),
            (0, 0),
            ]

    def test_ordered_attributes(self):
        for assigned, expected in self.set_get_pairs:
            self.parser.ordered_attributes = assigned
            self.assertEqual(self.parser.ordered_attributes, expected)

    def test_specified_attributes(self):
        for assigned, expected in self.set_get_pairs:
            self.parser.specified_attributes = assigned
            self.assertEqual(self.parser.specified_attributes, expected)
32
33
# Sample document used by the parsing tests below.  It contains an XML
# declaration, a processing instruction, a comment, a DOCTYPE with an
# internal subset (element, attribute-list, notation and entity
# declarations), plain and namespaced elements, a CDATA section, and
# references to an external entity and to an entity that is never declared.
data = b'''\
<?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
<?xml-stylesheet href="stylesheet.css"?>
<!-- comment data -->
<!DOCTYPE quotations SYSTEM "quotations.dtd" [
<!ELEMENT root ANY>
<!ATTLIST root attr1 CDATA #REQUIRED attr2 CDATA #IMPLIED>
<!NOTATION notation SYSTEM "notation.jpeg">
<!ENTITY acirc "&#226;">
<!ENTITY external_entity SYSTEM "entity.file">
<!ENTITY unparsed_entity SYSTEM "entity.file" NDATA notation>
%unparsed_entity;
]>

<root attr1="value1" attr2="value2&#8000;">
<myns:subelement xmlns:myns="http://www.python.org/namespace">
     Contents of subelements
</myns:subelement>
<sub2><![CDATA[contents of CDATA section]]></sub2>
&external_entity;
&skipped_entity;
</root>
'''
57
58
# Parse the sample document and check the events reported to the handlers
class ParseTest(unittest.TestCase):
    """Parse the sample document and compare the recorded handler events."""

    class Outputter:
        """Handler collection that appends a record of each call to ``out``."""

        def __init__(self):
            self.out = []

        def StartElementHandler(self, name, attrs):
            self.out.append('Start element: ' + repr(name) + ' ' +
                            sortdict(attrs))

        def EndElementHandler(self, name):
            self.out.append('End element: ' + repr(name))

        def CharacterDataHandler(self, data):
            # Character data that is empty after stripping is not recorded.
            data = data.strip()
            if data:
                self.out.append('Character data: ' + repr(data))

        def ProcessingInstructionHandler(self, target, data):
            self.out.append('PI: ' + repr(target) + ' ' + repr(data))

        def StartNamespaceDeclHandler(self, prefix, uri):
            self.out.append('NS decl: ' + repr(prefix) + ' ' + repr(uri))

        def EndNamespaceDeclHandler(self, prefix):
            self.out.append('End of NS decl: ' + repr(prefix))

        def StartCdataSectionHandler(self):
            self.out.append('Start of CDATA section')

        def EndCdataSectionHandler(self):
            self.out.append('End of CDATA section')

        def CommentHandler(self, text):
            self.out.append('Comment: ' + repr(text))

        def NotationDeclHandler(self, *args):
            # The unpacking fails unless exactly four arguments were passed.
            name, base, sysid, pubid = args
            self.out.append('Notation declared: %s' %(args,))

        def UnparsedEntityDeclHandler(self, *args):
            # The unpacking fails unless exactly five arguments were passed.
            entityName, base, systemId, publicId, notationName = args
            self.out.append('Unparsed entity decl: %s' %(args,))

        def NotStandaloneHandler(self):
            self.out.append('Not standalone')
            return 1

        def ExternalEntityRefHandler(self, *args):
            # The unpacking fails unless exactly four arguments were passed.
            # The first argument (the context) is left out of the record.
            context, base, sysId, pubId = args
            self.out.append('External entity ref: %s' %(args[1:],))
            return 1

        def StartDoctypeDeclHandler(self, *args):
            self.out.append(('Start doctype', args))
            return 1

        def EndDoctypeDeclHandler(self):
            self.out.append("End doctype")
            return 1

        def EntityDeclHandler(self, *args):
            self.out.append(('Entity declaration', args))
            return 1

        def XmlDeclHandler(self, *args):
            self.out.append(('XML declaration', args))
            return 1

        def ElementDeclHandler(self, *args):
            self.out.append(('Element declaration', args))
            return 1

        def AttlistDeclHandler(self, *args):
            self.out.append(('Attribute list declaration', args))
            return 1

        def SkippedEntityHandler(self, *args):
            self.out.append(("Skipped entity", args))
            return 1

        def DefaultHandler(self, userData):
            pass

        def DefaultHandlerExpand(self, userData):
            pass

    # Every name listed here is installed on the parser from an Outputter.
    handler_names = [
        'StartElementHandler', 'EndElementHandler', 'CharacterDataHandler',
        'ProcessingInstructionHandler', 'UnparsedEntityDeclHandler',
        'NotationDeclHandler', 'StartNamespaceDeclHandler',
        'EndNamespaceDeclHandler', 'CommentHandler',
        'StartCdataSectionHandler', 'EndCdataSectionHandler', 'DefaultHandler',
        'DefaultHandlerExpand', 'NotStandaloneHandler',
        'ExternalEntityRefHandler', 'StartDoctypeDeclHandler',
        'EndDoctypeDeclHandler', 'EntityDeclHandler', 'XmlDeclHandler',
        'ElementDeclHandler', 'AttlistDeclHandler', 'SkippedEntityHandler',
        ]

    def _make_parser(self, out):
        """Return a parser using '!' as namespace separator whose handlers
        listed in handler_names are bound to the same-named methods of *out*.
        """
        parser = expat.ParserCreate(namespace_separator='!')
        for name in self.handler_names:
            setattr(parser, name, getattr(out, name))
        return parser

    def _verify_parse_output(self, operations):
        """Compare *operations*, pairwise and in order, with the events
        expected for the sample document."""
        expected_operations = [
            ('XML declaration', ('1.0', 'iso-8859-1', 0)),
            'PI: \'xml-stylesheet\' \'href="stylesheet.css"\'',
            "Comment: ' comment data '",
            "Not standalone",
            ("Start doctype", ('quotations', 'quotations.dtd', None, 1)),
            ('Element declaration', ('root', (2, 0, None, ()))),
            ('Attribute list declaration', ('root', 'attr1', 'CDATA', None,
                1)),
            ('Attribute list declaration', ('root', 'attr2', 'CDATA', None,
                0)),
            "Notation declared: ('notation', None, 'notation.jpeg', None)",
            ('Entity declaration', ('acirc', 0, '\xe2', None, None, None, None)),
            ('Entity declaration', ('external_entity', 0, None, None,
                'entity.file', None, None)),
            "Unparsed entity decl: ('unparsed_entity', None, 'entity.file', None, 'notation')",
            "Not standalone",
            "End doctype",
            "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\u1f40'}",
            "NS decl: 'myns' 'http://www.python.org/namespace'",
            "Start element: 'http://www.python.org/namespace!subelement' {}",
            "Character data: 'Contents of subelements'",
            "End element: 'http://www.python.org/namespace!subelement'",
            "End of NS decl: 'myns'",
            "Start element: 'sub2' {}",
            'Start of CDATA section',
            "Character data: 'contents of CDATA section'",
            'End of CDATA section',
            "End element: 'sub2'",
            "External entity ref: (None, 'entity.file', None)",
            ('Skipped entity', ('skipped_entity', 0)),
            "End element: 'root'",
        ]
        # NOTE(review): zip() stops at the shorter sequence, so missing or
        # surplus events go unnoticed; consider also comparing the lengths.
        for operation, expected_operation in zip(operations, expected_operations):
            self.assertEqual(operation, expected_operation)

    def test_unicode(self):
        # Parse the whole document from a bytes object in a single call.
        out = self.Outputter()
        parser = self._make_parser(out)

        parser.Parse(data, 1)

        self._verify_parse_output(out.out)

    def test_parse_file(self):
        # Parse the same document through a file-like object.
        out = self.Outputter()
        parser = self._make_parser(out)
        stream = BytesIO(data)

        parser.ParseFile(stream)

        self._verify_parse_output(out.out)
219
class NamespaceSeparatorTest(unittest.TestCase):
    """Check which namespace_separator values ParserCreate() accepts."""

    def test_legal(self):
        # None of these namespace_separator values may raise.
        expat.ParserCreate()
        expat.ParserCreate(namespace_separator=None)
        expat.ParserCreate(namespace_separator=' ')

    def test_illegal(self):
        # A separator that is not a string is a TypeError.  The message
        # refers to the parameter either by position or by name depending
        # on the interpreter version, so both spellings are accepted.
        with self.assertRaises(TypeError) as caught:
            expat.ParserCreate(namespace_separator=42)
        self.assertIn(str(caught.exception), (
            'ParserCreate() argument 2 must be str or None, not int',
            "ParserCreate() argument 'namespace_separator' must be "
            "str or None, not int",
            ))

        # A separator longer than one character is a ValueError.
        with self.assertRaises(ValueError) as caught:
            expat.ParserCreate(namespace_separator='too long')
        self.assertEqual(str(caught.exception),
            'namespace_separator must be at most one character, omitted, or None')

    def test_zero_length(self):
        # ParserCreate() needs to accept a namespace_separator of zero length
        # to satisfy the requirements of RDF applications that are required
        # to simply glue together the namespace URI and the localname.  Though
        # considered a wart of the RDF specifications, it needs to be supported.
        #
        # See XML-SIG mailing list thread starting with
        # http://mail.python.org/pipermail/xml-sig/2001-April/005202.html
        #
        expat.ParserCreate(namespace_separator='') # empty, but legal
253
254
class InterningTest(unittest.TestCase):
    """Tests around the sharing of name strings handed to handlers."""

    def test(self):
        # Test the interning machinery.
        p = expat.ParserCreate()
        L = []
        def collector(name, *args):
            L.append(name)
        p.StartElementHandler = collector
        p.EndElementHandler = collector
        p.Parse("<e> <e/> <e></e> </e>", 1)
        tag = L[0]
        # Three elements, each reported once on start and once on end.
        self.assertEqual(len(L), 6)
        for entry in L:
            # L should have the same string object repeated over and over.
            self.assertIs(tag, entry)

    def test_issue9402(self):
        # create an ExternalEntityParserCreate with buffer text
        class ExternalOutputter:
            """Parses an empty document with a child parser on every
            external entity reference and keeps the result of that parse."""

            def __init__(self, parser):
                self.parser = parser
                self.parser_result = None

            def ExternalEntityRefHandler(self, context, base, sysId, pubId):
                external_parser = self.parser.ExternalEntityParserCreate("")
                self.parser_result = external_parser.Parse("", 1)
                return 1

        parser = expat.ParserCreate(namespace_separator='!')
        parser.buffer_text = 1
        out = ExternalOutputter(parser)
        parser.ExternalEntityRefHandler = out.ExternalEntityRefHandler
        parser.Parse(data, 1)
        self.assertEqual(out.parser_result, 1)
289
290
class BufferTextTest(unittest.TestCase):
    """Check how character data is grouped when buffer_text is switched on.

    Every handler defined here appends what it received to ``self.stuff``;
    the tests compare that list with the expected sequence.
    """

    def setUp(self):
        self.stuff = []
        self.parser = expat.ParserCreate()
        self.parser.buffer_text = 1
        self.parser.CharacterDataHandler = self.CharacterDataHandler

    def check(self, expected, label):
        """Assert that the collected items equal *expected*.

        *label* starts the failure message, followed by both lists.
        """
        # map() is lazy on Python 3; build a list so that the message shows
        # the items instead of the repr of a map object.
        self.assertEqual(self.stuff, expected,
                "%s\nstuff    = %r\nexpected = %r"
                % (label, self.stuff, list(map(str, expected))))

    def CharacterDataHandler(self, text):
        self.stuff.append(text)

    def StartElementHandler(self, name, attrs):
        self.stuff.append("<%s>" % name)
        # A "buffer-text" attribute switches buffering while parsing.
        bt = attrs.get("buffer-text")
        if bt == "yes":
            self.parser.buffer_text = 1
        elif bt == "no":
            self.parser.buffer_text = 0

    def EndElementHandler(self, name):
        self.stuff.append("</%s>" % name)

    def CommentHandler(self, data):
        self.stuff.append("<!--%s-->" % data)

    def setHandlers(self, handlers=()):
        """Install the methods of this object named in *handlers* as the
        parser handlers of the same name."""
        for name in handlers:
            setattr(self.parser, name, getattr(self, name))

    def test_default_to_disabled(self):
        parser = expat.ParserCreate()
        self.assertFalse(parser.buffer_text)

    def test_buffering_enabled(self):
        # Make sure buffering is turned on
        self.assertTrue(self.parser.buffer_text)
        self.parser.Parse("<a>1<b/>2<c/>3</a>", 1)
        self.assertEqual(self.stuff, ['123'],
                         "buffered text not properly collapsed")

    def test1(self):
        # XXX This test exposes more detail of Expat's text chunking than we
        # XXX like, but it tests what we need to concisely.
        self.setHandlers(["StartElementHandler"])
        self.parser.Parse("<a>1<b buffer-text='no'/>2\n3<c buffer-text='yes'/>4\n5</a>", 1)
        self.assertEqual(self.stuff,
                         ["<a>", "1", "<b>", "2", "\n", "3", "<c>", "4\n5"],
                         "buffering control not reacting as expected")

    def test2(self):
        self.parser.Parse("<a>1<b/>&lt;2&gt;<c/>&#32;\n&#x20;3</a>", 1)
        self.assertEqual(self.stuff, ["1<2> \n 3"],
                         "buffered text not properly collapsed")

    def test3(self):
        self.setHandlers(["StartElementHandler"])
        self.parser.Parse("<a>1<b/>2<c/>3</a>", 1)
        self.assertEqual(self.stuff, ["<a>", "1", "<b>", "2", "<c>", "3"],
                         "buffered text not properly split")

    def test4(self):
        self.setHandlers(["StartElementHandler", "EndElementHandler"])
        # With the character data handler removed only tags are collected.
        self.parser.CharacterDataHandler = None
        self.parser.Parse("<a>1<b/>2<c/>3</a>", 1)
        self.assertEqual(self.stuff,
                         ["<a>", "<b>", "</b>", "<c>", "</c>", "</a>"])

    def test5(self):
        self.setHandlers(["StartElementHandler", "EndElementHandler"])
        self.parser.Parse("<a>1<b></b>2<c/>3</a>", 1)
        self.assertEqual(self.stuff,
            ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", "</a>"])

    def test6(self):
        self.setHandlers(["CommentHandler", "EndElementHandler",
                    "StartElementHandler"])
        self.parser.Parse("<a>1<b/>2<c></c>345</a> ", 1)
        self.assertEqual(self.stuff,
            ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "345", "</a>"],
            "buffered text not properly split")

    def test7(self):
        self.setHandlers(["CommentHandler", "EndElementHandler",
                    "StartElementHandler"])
        self.parser.Parse("<a>1<b/>2<c></c>3<!--abc-->4<!--def-->5</a> ", 1)
        self.assertEqual(self.stuff,
                         ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3",
                          "<!--abc-->", "4", "<!--def-->", "5", "</a>"],
                         "buffered text not properly split")
384
385
386# Test handling of exception from callback:
class HandlerExceptionTest(unittest.TestCase):
    """An exception raised inside a handler must reach the caller of Parse()."""

    def StartElementHandler(self, name, attrs):
        # Fails on every start tag, carrying the element name.
        raise RuntimeError(name)

    def test(self):
        parser = expat.ParserCreate()
        parser.StartElementHandler = self.StartElementHandler
        with self.assertRaises(RuntimeError) as caught:
            parser.Parse("<a><b><c/></b></a>", 1)
        e = caught.exception
        # The error must stem from the first start tag.
        self.assertEqual(e.args[0], 'a',
                         "Expected RuntimeError for element 'a', but" + \
                         " found %r" % e.args[0])
401
402
403# Test Current* members:
class PositionTest(unittest.TestCase):
    """Check the Current* position attributes seen from inside handlers."""

    def StartElementHandler(self, name, attrs):
        self.check_pos('s')

    def EndElementHandler(self, name):
        self.check_pos('e')

    def check_pos(self, event):
        """Compare the parser's current position with the next expected entry.

        *event* is 's' for a start tag and 'e' for an end tag; together with
        byte index, line number and column number it forms the tuple that is
        compared with ``self.expected_list[self.upto]``.
        """
        pos = (event,
               self.parser.CurrentByteIndex,
               self.parser.CurrentLineNumber,
               self.parser.CurrentColumnNumber)
        self.assertTrue(self.upto < len(self.expected_list),
                        'too many parser events')
        expected = self.expected_list[self.upto]
        self.assertEqual(pos, expected,
                'Expected position %s, got position %s' % (expected, pos))
        self.upto += 1

    def test(self):
        self.parser = expat.ParserCreate()
        self.parser.StartElementHandler = self.StartElementHandler
        self.parser.EndElementHandler = self.EndElementHandler
        self.upto = 0
        # (event, byte index, line number, column number) per handler call.
        self.expected_list = [('s', 0, 1, 0), ('s', 5, 2, 1), ('s', 11, 3, 2),
                              ('e', 15, 3, 6), ('e', 17, 4, 1), ('e', 22, 5, 0)]

        xml = '<a>\n <b>\n  <c/>\n </b>\n</a>'
        self.parser.Parse(xml, 1)
        # Every expected event must have been seen, not just a prefix.
        self.assertEqual(self.upto, len(self.expected_list),
                         'too few parser events')
433
434
class sf1296433Test(unittest.TestCase):
    """Regression test for http://python.org/sf/1296433."""

    def test_parse_only_xml_data(self):
        # http://python.org/sf/1296433
        #
        xml = "<?xml version='1.0' encoding='iso8859'?><s>%s</s>" % ('a' * 1025)
        # this one doesn't crash
        #xml = "<?xml version='1.0'?><s>%s</s>" % ('a' * 10000)

        class SpecificException(Exception):
            pass

        def handler(text):
            raise SpecificException

        parser = expat.ParserCreate()
        parser.CharacterDataHandler = handler

        # Feed bytes so that the encoding named in the declaration is the
        # one used (NOTE(review): newer pyexpat releases are understood to
        # treat str input as UTF-8 regardless of the declaration), and
        # expect exactly the handler's exception so that an unrelated
        # failure cannot satisfy the test.
        self.assertRaises(SpecificException, parser.Parse,
                          xml.encode('iso8859'))
453
class ChardataBufferTest(unittest.TestCase):
    """
    Test how buffer_size and buffer_text affect the number of calls made to
    the character data handler.

    Documents that declare an encoding are fed as bytes so that the
    declaration is honoured.  NOTE(review): newer pyexpat releases are
    understood to treat str input as UTF-8 regardless of the declaration,
    which changes how the text is chunked and therefore the call counts.
    """

    def test_1025_bytes(self):
        self.assertEqual(self.small_buffer_test(1025), 2)

    def test_1000_bytes(self):
        self.assertEqual(self.small_buffer_test(1000), 1)

    def test_wrong_size(self):
        parser = expat.ParserCreate()
        parser.buffer_text = 1
        def f(size):
            parser.buffer_size = size

        # Sizes below 1 are rejected.
        self.assertRaises(ValueError, f, -1)
        self.assertRaises(ValueError, f, 0)

    def test_unchanged_size(self):
        xml1 = b"<?xml version='1.0' encoding='iso8859'?><s>" + b'a' * 512
        xml2 = b'a' * 512 + b'</s>'
        parser = expat.ParserCreate()
        parser.CharacterDataHandler = self.counting_handler
        parser.buffer_size = 512
        parser.buffer_text = 1

        # Feed 512 bytes of character data: the handler should be called
        # once.
        self.n = 0
        parser.Parse(xml1)
        self.assertEqual(self.n, 1)

        # Reassign to buffer_size, but assign the same size.
        parser.buffer_size = parser.buffer_size
        self.assertEqual(self.n, 1)

        # Try parsing rest of the document
        parser.Parse(xml2)
        self.assertEqual(self.n, 2)

    def test_disabling_buffer(self):
        xml1 = b"<?xml version='1.0' encoding='iso8859'?><a>" + b'a' * 512
        xml2 = b'b' * 1024
        xml3 = b'c' * 1024 + b'</a>'
        parser = expat.ParserCreate()
        parser.CharacterDataHandler = self.counting_handler
        parser.buffer_text = 1
        parser.buffer_size = 1024
        self.assertEqual(parser.buffer_size, 1024)

        # Parse one chunk of XML
        self.n = 0
        parser.Parse(xml1, 0)
        self.assertEqual(parser.buffer_size, 1024)
        self.assertEqual(self.n, 1)

        # Turn off buffering and parse the next chunk.
        parser.buffer_text = 0
        self.assertFalse(parser.buffer_text)
        self.assertEqual(parser.buffer_size, 1024)
        for i in range(10):
            parser.Parse(xml2, 0)
        self.assertEqual(self.n, 11)

        # Turn buffering back on; the configured size must have survived.
        parser.buffer_text = 1
        self.assertTrue(parser.buffer_text)
        self.assertEqual(parser.buffer_size, 1024)
        parser.Parse(xml3, 1)
        self.assertEqual(self.n, 12)

    def make_document(self, bytes):
        """Return a document whose <tag> element holds *bytes* 'a' characters."""
        return ("<?xml version='1.0'?><tag>" + bytes * 'a' + '</tag>')

    def counting_handler(self, text):
        """Character data handler that only counts its invocations."""
        self.n += 1

    def small_buffer_test(self, buffer_len):
        """Parse *buffer_len* characters of text with a 1024 byte buffer and
        return how many times the character data handler was called."""
        xml = (b"<?xml version='1.0' encoding='iso8859'?><s>"
               + b'a' * buffer_len + b"</s>")
        parser = expat.ParserCreate()
        parser.CharacterDataHandler = self.counting_handler
        parser.buffer_size = 1024
        parser.buffer_text = 1

        self.n = 0
        parser.Parse(xml)
        return self.n

    def test_change_size_1(self):
        xml1 = b"<?xml version='1.0' encoding='iso8859'?><a><s>" + b'a' * 1024
        xml2 = b"aaa</s><s>" + b'a' * 1025 + b"</s></a>"
        parser = expat.ParserCreate()
        parser.CharacterDataHandler = self.counting_handler
        parser.buffer_text = 1
        parser.buffer_size = 1024
        self.assertEqual(parser.buffer_size, 1024)

        self.n = 0
        parser.Parse(xml1, 0)
        # Grow the buffer between the two chunks.
        parser.buffer_size *= 2
        self.assertEqual(parser.buffer_size, 2048)
        parser.Parse(xml2, 1)
        self.assertEqual(self.n, 2)

    def test_change_size_2(self):
        xml1 = b"<?xml version='1.0' encoding='iso8859'?><a>a<s>" + b'a' * 1023
        xml2 = b"aaa</s><s>" + b'a' * 1025 + b"</s></a>"
        parser = expat.ParserCreate()
        parser.CharacterDataHandler = self.counting_handler
        parser.buffer_text = 1
        parser.buffer_size = 2048
        self.assertEqual(parser.buffer_size, 2048)

        self.n = 0
        parser.Parse(xml1, 0)
        # Shrink the buffer between the two chunks.
        parser.buffer_size = parser.buffer_size // 2
        self.assertEqual(parser.buffer_size, 1024)
        parser.Parse(xml2, 1)
        self.assertEqual(self.n, 4)
577
class MalformedInputText(unittest.TestCase):
    """Check the errors reported for input that is not well-formed.

    The inputs are bytes: their escapes spell raw octets, which a str
    literal would turn into different characters before parsing.
    """

    def test1(self):
        xml = b"\0\r\n"
        parser = expat.ParserCreate()
        with self.assertRaises(expat.ExpatError) as caught:
            parser.Parse(xml, True)
        self.assertEqual(str(caught.exception),
                         'unclosed token: line 2, column 0')

    def test2(self):
        # \xc2\x85 is the UTF-8 encoding of U+0085.
        xml = b"<?xml version\xc2\x85='1.0'?>\r\n"
        parser = expat.ParserCreate()
        with self.assertRaises(expat.ExpatError) as caught:
            parser.Parse(xml, True)
        message = str(caught.exception)
        # NOTE(review): only the error and its line are pinned; the reported
        # column is believed to differ between expat versions -- confirm.
        self.assertTrue(
            message.startswith(
                'XML declaration not well-formed: line 1, column '),
            message)
596
def test_main():
    """Run every test case class of this module through run_unittest()."""
    test_classes = (
        SetAttributeTest,
        ParseTest,
        NamespaceSeparatorTest,
        InterningTest,
        BufferTextTest,
        HandlerExceptionTest,
        PositionTest,
        sf1296433Test,
        ChardataBufferTest,
        MalformedInputText,
    )
    run_unittest(*test_classes)


# Allow running this file directly as a script.
if __name__ == "__main__":
    test_main()
611