# test_pyexpat.py revision b44155483515123a320ae57ec4b7ab237f991362
1# XXX TypeErrors on calling handlers, or on bad return values from a 2# handler, are obscure and unhelpful. 3 4from io import BytesIO 5import sys 6import unittest 7 8from xml.parsers import expat 9 10from test.support import sortdict, run_unittest 11 12 13class SetAttributeTest(unittest.TestCase): 14 def setUp(self): 15 self.parser = expat.ParserCreate(namespace_separator='!') 16 self.set_get_pairs = [ 17 [0, 0], 18 [1, 1], 19 [2, 1], 20 [0, 0], 21 ] 22 23 def test_ordered_attributes(self): 24 for x, y in self.set_get_pairs: 25 self.parser.ordered_attributes = x 26 self.assertEquals(self.parser.ordered_attributes, y) 27 28 def test_specified_attributes(self): 29 for x, y in self.set_get_pairs: 30 self.parser.specified_attributes = x 31 self.assertEquals(self.parser.specified_attributes, y) 32 33 34data = b'''\ 35<?xml version="1.0" encoding="iso-8859-1" standalone="no"?> 36<?xml-stylesheet href="stylesheet.css"?> 37<!-- comment data --> 38<!DOCTYPE quotations SYSTEM "quotations.dtd" [ 39<!ELEMENT root ANY> 40<!ATTLIST root attr1 CDATA #REQUIRED attr2 CDATA #IMPLIED> 41<!NOTATION notation SYSTEM "notation.jpeg"> 42<!ENTITY acirc "â"> 43<!ENTITY external_entity SYSTEM "entity.file"> 44<!ENTITY unparsed_entity SYSTEM "entity.file" NDATA notation> 45%unparsed_entity; 46]> 47 48<root attr1="value1" attr2="value2ὀ"> 49<myns:subelement xmlns:myns="http://www.python.org/namespace"> 50 Contents of subelements 51</myns:subelement> 52<sub2><![CDATA[contents of CDATA section]]></sub2> 53&external_entity; 54&skipped_entity; 55</root> 56''' 57 58 59# Produce UTF-8 output 60class ParseTest(unittest.TestCase): 61 class Outputter: 62 def __init__(self): 63 self.out = [] 64 65 def StartElementHandler(self, name, attrs): 66 self.out.append('Start element: ' + repr(name) + ' ' + 67 sortdict(attrs)) 68 69 def EndElementHandler(self, name): 70 self.out.append('End element: ' + repr(name)) 71 72 def CharacterDataHandler(self, data): 73 data = data.strip() 74 if data: 75 
self.out.append('Character data: ' + repr(data)) 76 77 def ProcessingInstructionHandler(self, target, data): 78 self.out.append('PI: ' + repr(target) + ' ' + repr(data)) 79 80 def StartNamespaceDeclHandler(self, prefix, uri): 81 self.out.append('NS decl: ' + repr(prefix) + ' ' + repr(uri)) 82 83 def EndNamespaceDeclHandler(self, prefix): 84 self.out.append('End of NS decl: ' + repr(prefix)) 85 86 def StartCdataSectionHandler(self): 87 self.out.append('Start of CDATA section') 88 89 def EndCdataSectionHandler(self): 90 self.out.append('End of CDATA section') 91 92 def CommentHandler(self, text): 93 self.out.append('Comment: ' + repr(text)) 94 95 def NotationDeclHandler(self, *args): 96 name, base, sysid, pubid = args 97 self.out.append('Notation declared: %s' %(args,)) 98 99 def UnparsedEntityDeclHandler(self, *args): 100 entityName, base, systemId, publicId, notationName = args 101 self.out.append('Unparsed entity decl: %s' %(args,)) 102 103 def NotStandaloneHandler(self): 104 self.out.append('Not standalone') 105 return 1 106 107 def ExternalEntityRefHandler(self, *args): 108 context, base, sysId, pubId = args 109 self.out.append('External entity ref: %s' %(args[1:],)) 110 return 1 111 112 def StartDoctypeDeclHandler(self, *args): 113 self.out.append(('Start doctype', args)) 114 return 1 115 116 def EndDoctypeDeclHandler(self): 117 self.out.append("End doctype") 118 return 1 119 120 def EntityDeclHandler(self, *args): 121 self.out.append(('Entity declaration', args)) 122 return 1 123 124 def XmlDeclHandler(self, *args): 125 self.out.append(('XML declaration', args)) 126 return 1 127 128 def ElementDeclHandler(self, *args): 129 self.out.append(('Element declaration', args)) 130 return 1 131 132 def AttlistDeclHandler(self, *args): 133 self.out.append(('Attribute list declaration', args)) 134 return 1 135 136 def SkippedEntityHandler(self, *args): 137 self.out.append(("Skipped entity", args)) 138 return 1 139 140 def DefaultHandler(self, userData): 141 pass 142 143 
def DefaultHandlerExpand(self, userData): 144 pass 145 146 handler_names = [ 147 'StartElementHandler', 'EndElementHandler', 'CharacterDataHandler', 148 'ProcessingInstructionHandler', 'UnparsedEntityDeclHandler', 149 'NotationDeclHandler', 'StartNamespaceDeclHandler', 150 'EndNamespaceDeclHandler', 'CommentHandler', 151 'StartCdataSectionHandler', 'EndCdataSectionHandler', 'DefaultHandler', 152 'DefaultHandlerExpand', 'NotStandaloneHandler', 153 'ExternalEntityRefHandler', 'StartDoctypeDeclHandler', 154 'EndDoctypeDeclHandler', 'EntityDeclHandler', 'XmlDeclHandler', 155 'ElementDeclHandler', 'AttlistDeclHandler', 'SkippedEntityHandler', 156 ] 157 158 def _verify_parse_output(self, operations): 159 expected_operations = [ 160 ('XML declaration', ('1.0', 'iso-8859-1', 0)), 161 'PI: \'xml-stylesheet\' \'href="stylesheet.css"\'', 162 "Comment: ' comment data '", 163 "Not standalone", 164 ("Start doctype", ('quotations', 'quotations.dtd', None, 1)), 165 ('Element declaration', ('root', (2, 0, None, ()))), 166 ('Attribute list declaration', ('root', 'attr1', 'CDATA', None, 167 1)), 168 ('Attribute list declaration', ('root', 'attr2', 'CDATA', None, 169 0)), 170 "Notation declared: ('notation', None, 'notation.jpeg', None)", 171 ('Entity declaration', ('acirc', 0, '\xe2', None, None, None, None)), 172 ('Entity declaration', ('external_entity', 0, None, None, 173 'entity.file', None, None)), 174 "Unparsed entity decl: ('unparsed_entity', None, 'entity.file', None, 'notation')", 175 "Not standalone", 176 "End doctype", 177 "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\u1f40'}", 178 "NS decl: 'myns' 'http://www.python.org/namespace'", 179 "Start element: 'http://www.python.org/namespace!subelement' {}", 180 "Character data: 'Contents of subelements'", 181 "End element: 'http://www.python.org/namespace!subelement'", 182 "End of NS decl: 'myns'", 183 "Start element: 'sub2' {}", 184 'Start of CDATA section', 185 "Character data: 'contents of CDATA section'", 186 
'End of CDATA section', 187 "End element: 'sub2'", 188 "External entity ref: (None, 'entity.file', None)", 189 ('Skipped entity', ('skipped_entity', 0)), 190 "End element: 'root'", 191 ] 192 for operation, expected_operation in zip(operations, expected_operations): 193 self.assertEquals(operation, expected_operation) 194 195 def test_unicode(self): 196 # Try the parse again, this time producing Unicode output 197 out = self.Outputter() 198 parser = expat.ParserCreate(namespace_separator='!') 199 for name in self.handler_names: 200 setattr(parser, name, getattr(out, name)) 201 202 parser.Parse(data, 1) 203 204 operations = out.out 205 self._verify_parse_output(operations) 206 207 def test_parse_file(self): 208 # Try parsing a file 209 out = self.Outputter() 210 parser = expat.ParserCreate(namespace_separator='!') 211 for name in self.handler_names: 212 setattr(parser, name, getattr(out, name)) 213 file = BytesIO(data) 214 215 parser.ParseFile(file) 216 217 operations = out.out 218 self._verify_parse_output(operations) 219 220class NamespaceSeparatorTest(unittest.TestCase): 221 def test_legal(self): 222 # Tests that make sure we get errors when the namespace_separator value 223 # is illegal, and that we don't for good values: 224 expat.ParserCreate() 225 expat.ParserCreate(namespace_separator=None) 226 expat.ParserCreate(namespace_separator=' ') 227 228 def test_illegal(self): 229 try: 230 expat.ParserCreate(namespace_separator=42) 231 self.fail() 232 except TypeError as e: 233 self.assertEquals(str(e), 234 'ParserCreate() argument 2 must be str or None, not int') 235 236 try: 237 expat.ParserCreate(namespace_separator='too long') 238 self.fail() 239 except ValueError as e: 240 self.assertEquals(str(e), 241 'namespace_separator must be at most one character, omitted, or None') 242 243 def test_zero_length(self): 244 # ParserCreate() needs to accept a namespace_separator of zero length 245 # to satisfy the requirements of RDF applications that are required 246 # to 
simply glue together the namespace URI and the localname. Though 247 # considered a wart of the RDF specifications, it needs to be supported. 248 # 249 # See XML-SIG mailing list thread starting with 250 # http://mail.python.org/pipermail/xml-sig/2001-April/005202.html 251 # 252 expat.ParserCreate(namespace_separator='') # too short 253 254 255class InterningTest(unittest.TestCase): 256 def test(self): 257 # Test the interning machinery. 258 p = expat.ParserCreate() 259 L = [] 260 def collector(name, *args): 261 L.append(name) 262 p.StartElementHandler = collector 263 p.EndElementHandler = collector 264 p.Parse("<e> <e/> <e></e> </e>", 1) 265 tag = L[0] 266 self.assertEquals(len(L), 6) 267 for entry in L: 268 # L should have the same string repeated over and over. 269 self.assertTrue(tag is entry) 270 271 def test_issue9402(self): 272 # create an ExternalEntityParserCreate with buffer text 273 class ExternalOutputter: 274 def __init__(self, parser): 275 self.parser = parser 276 self.parser_result = None 277 278 def ExternalEntityRefHandler(self, context, base, sysId, pubId): 279 external_parser = self.parser.ExternalEntityParserCreate("") 280 self.parser_result = external_parser.Parse("", 1) 281 return 1 282 283 parser = expat.ParserCreate(namespace_separator='!') 284 parser.buffer_text = 1 285 out = ExternalOutputter(parser) 286 parser.ExternalEntityRefHandler = out.ExternalEntityRefHandler 287 parser.Parse(data, 1) 288 self.assertEquals(out.parser_result, 1) 289 290 291class BufferTextTest(unittest.TestCase): 292 def setUp(self): 293 self.stuff = [] 294 self.parser = expat.ParserCreate() 295 self.parser.buffer_text = 1 296 self.parser.CharacterDataHandler = self.CharacterDataHandler 297 298 def check(self, expected, label): 299 self.assertEquals(self.stuff, expected, 300 "%s\nstuff = %r\nexpected = %r" 301 % (label, self.stuff, map(str, expected))) 302 303 def CharacterDataHandler(self, text): 304 self.stuff.append(text) 305 306 def StartElementHandler(self, 
name, attrs): 307 self.stuff.append("<%s>" % name) 308 bt = attrs.get("buffer-text") 309 if bt == "yes": 310 self.parser.buffer_text = 1 311 elif bt == "no": 312 self.parser.buffer_text = 0 313 314 def EndElementHandler(self, name): 315 self.stuff.append("</%s>" % name) 316 317 def CommentHandler(self, data): 318 self.stuff.append("<!--%s-->" % data) 319 320 def setHandlers(self, handlers=[]): 321 for name in handlers: 322 setattr(self.parser, name, getattr(self, name)) 323 324 def test_default_to_disabled(self): 325 parser = expat.ParserCreate() 326 self.assertFalse(parser.buffer_text) 327 328 def test_buffering_enabled(self): 329 # Make sure buffering is turned on 330 self.assertTrue(self.parser.buffer_text) 331 self.parser.Parse("<a>1<b/>2<c/>3</a>", 1) 332 self.assertEquals(self.stuff, ['123'], 333 "buffered text not properly collapsed") 334 335 def test1(self): 336 # XXX This test exposes more detail of Expat's text chunking than we 337 # XXX like, but it tests what we need to concisely. 
338 self.setHandlers(["StartElementHandler"]) 339 self.parser.Parse("<a>1<b buffer-text='no'/>2\n3<c buffer-text='yes'/>4\n5</a>", 1) 340 self.assertEquals(self.stuff, 341 ["<a>", "1", "<b>", "2", "\n", "3", "<c>", "4\n5"], 342 "buffering control not reacting as expected") 343 344 def test2(self): 345 self.parser.Parse("<a>1<b/><2><c/> \n 3</a>", 1) 346 self.assertEquals(self.stuff, ["1<2> \n 3"], 347 "buffered text not properly collapsed") 348 349 def test3(self): 350 self.setHandlers(["StartElementHandler"]) 351 self.parser.Parse("<a>1<b/>2<c/>3</a>", 1) 352 self.assertEquals(self.stuff, ["<a>", "1", "<b>", "2", "<c>", "3"], 353 "buffered text not properly split") 354 355 def test4(self): 356 self.setHandlers(["StartElementHandler", "EndElementHandler"]) 357 self.parser.CharacterDataHandler = None 358 self.parser.Parse("<a>1<b/>2<c/>3</a>", 1) 359 self.assertEquals(self.stuff, 360 ["<a>", "<b>", "</b>", "<c>", "</c>", "</a>"]) 361 362 def test5(self): 363 self.setHandlers(["StartElementHandler", "EndElementHandler"]) 364 self.parser.Parse("<a>1<b></b>2<c/>3</a>", 1) 365 self.assertEquals(self.stuff, 366 ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", "</a>"]) 367 368 def test6(self): 369 self.setHandlers(["CommentHandler", "EndElementHandler", 370 "StartElementHandler"]) 371 self.parser.Parse("<a>1<b/>2<c></c>345</a> ", 1) 372 self.assertEquals(self.stuff, 373 ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "345", "</a>"], 374 "buffered text not properly split") 375 376 def test7(self): 377 self.setHandlers(["CommentHandler", "EndElementHandler", 378 "StartElementHandler"]) 379 self.parser.Parse("<a>1<b/>2<c></c>3<!--abc-->4<!--def-->5</a> ", 1) 380 self.assertEquals(self.stuff, 381 ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", 382 "<!--abc-->", "4", "<!--def-->", "5", "</a>"], 383 "buffered text not properly split") 384 385 386# Test handling of exception from callback: 387class HandlerExceptionTest(unittest.TestCase): 388 def 
StartElementHandler(self, name, attrs): 389 raise RuntimeError(name) 390 391 def test(self): 392 parser = expat.ParserCreate() 393 parser.StartElementHandler = self.StartElementHandler 394 try: 395 parser.Parse("<a><b><c/></b></a>", 1) 396 self.fail() 397 except RuntimeError as e: 398 self.assertEquals(e.args[0], 'a', 399 "Expected RuntimeError for element 'a', but" + \ 400 " found %r" % e.args[0]) 401 402 403# Test Current* members: 404class PositionTest(unittest.TestCase): 405 def StartElementHandler(self, name, attrs): 406 self.check_pos('s') 407 408 def EndElementHandler(self, name): 409 self.check_pos('e') 410 411 def check_pos(self, event): 412 pos = (event, 413 self.parser.CurrentByteIndex, 414 self.parser.CurrentLineNumber, 415 self.parser.CurrentColumnNumber) 416 self.assertTrue(self.upto < len(self.expected_list), 417 'too many parser events') 418 expected = self.expected_list[self.upto] 419 self.assertEquals(pos, expected, 420 'Expected position %s, got position %s' %(pos, expected)) 421 self.upto += 1 422 423 def test(self): 424 self.parser = expat.ParserCreate() 425 self.parser.StartElementHandler = self.StartElementHandler 426 self.parser.EndElementHandler = self.EndElementHandler 427 self.upto = 0 428 self.expected_list = [('s', 0, 1, 0), ('s', 5, 2, 1), ('s', 11, 3, 2), 429 ('e', 15, 3, 6), ('e', 17, 4, 1), ('e', 22, 5, 0)] 430 431 xml = '<a>\n <b>\n <c/>\n </b>\n</a>' 432 self.parser.Parse(xml, 1) 433 434 435class sf1296433Test(unittest.TestCase): 436 def test_parse_only_xml_data(self): 437 # http://python.org/sf/1296433 438 # 439 xml = "<?xml version='1.0' encoding='iso8859'?><s>%s</s>" % ('a' * 1025) 440 # this one doesn't crash 441 #xml = "<?xml version='1.0'?><s>%s</s>" % ('a' * 10000) 442 443 class SpecificException(Exception): 444 pass 445 446 def handler(text): 447 raise SpecificException 448 449 parser = expat.ParserCreate() 450 parser.CharacterDataHandler = handler 451 452 self.assertRaises(Exception, parser.Parse, xml) 453 454class 
ChardataBufferTest(unittest.TestCase): 455 """ 456 test setting of chardata buffer size 457 """ 458 459 def test_1025_bytes(self): 460 self.assertEquals(self.small_buffer_test(1025), 2) 461 462 def test_1000_bytes(self): 463 self.assertEquals(self.small_buffer_test(1000), 1) 464 465 def test_wrong_size(self): 466 parser = expat.ParserCreate() 467 parser.buffer_text = 1 468 def f(size): 469 parser.buffer_size = size 470 471 self.assertRaises(ValueError, f, -1) 472 self.assertRaises(ValueError, f, 0) 473 474 def test_unchanged_size(self): 475 xml1 = ("<?xml version='1.0' encoding='iso8859'?><s>%s" % ('a' * 512)) 476 xml2 = 'a'*512 + '</s>' 477 parser = expat.ParserCreate() 478 parser.CharacterDataHandler = self.counting_handler 479 parser.buffer_size = 512 480 parser.buffer_text = 1 481 482 # Feed 512 bytes of character data: the handler should be called 483 # once. 484 self.n = 0 485 parser.Parse(xml1) 486 self.assertEquals(self.n, 1) 487 488 # Reassign to buffer_size, but assign the same size. 489 parser.buffer_size = parser.buffer_size 490 self.assertEquals(self.n, 1) 491 492 # Try parsing rest of the document 493 parser.Parse(xml2) 494 self.assertEquals(self.n, 2) 495 496 497 def test_disabling_buffer(self): 498 xml1 = "<?xml version='1.0' encoding='iso8859'?><a>%s" % ('a' * 512) 499 xml2 = ('b' * 1024) 500 xml3 = "%s</a>" % ('c' * 1024) 501 parser = expat.ParserCreate() 502 parser.CharacterDataHandler = self.counting_handler 503 parser.buffer_text = 1 504 parser.buffer_size = 1024 505 self.assertEquals(parser.buffer_size, 1024) 506 507 # Parse one chunk of XML 508 self.n = 0 509 parser.Parse(xml1, 0) 510 self.assertEquals(parser.buffer_size, 1024) 511 self.assertEquals(self.n, 1) 512 513 # Turn off buffering and parse the next chunk. 
514 parser.buffer_text = 0 515 self.assertFalse(parser.buffer_text) 516 self.assertEquals(parser.buffer_size, 1024) 517 for i in range(10): 518 parser.Parse(xml2, 0) 519 self.assertEquals(self.n, 11) 520 521 parser.buffer_text = 1 522 self.assertTrue(parser.buffer_text) 523 self.assertEquals(parser.buffer_size, 1024) 524 parser.Parse(xml3, 1) 525 self.assertEquals(self.n, 12) 526 527 528 529 def make_document(self, bytes): 530 return ("<?xml version='1.0'?><tag>" + bytes * 'a' + '</tag>') 531 532 def counting_handler(self, text): 533 self.n += 1 534 535 def small_buffer_test(self, buffer_len): 536 xml = "<?xml version='1.0' encoding='iso8859'?><s>%s</s>" % ('a' * buffer_len) 537 parser = expat.ParserCreate() 538 parser.CharacterDataHandler = self.counting_handler 539 parser.buffer_size = 1024 540 parser.buffer_text = 1 541 542 self.n = 0 543 parser.Parse(xml) 544 return self.n 545 546 def test_change_size_1(self): 547 xml1 = "<?xml version='1.0' encoding='iso8859'?><a><s>%s" % ('a' * 1024) 548 xml2 = "aaa</s><s>%s</s></a>" % ('a' * 1025) 549 parser = expat.ParserCreate() 550 parser.CharacterDataHandler = self.counting_handler 551 parser.buffer_text = 1 552 parser.buffer_size = 1024 553 self.assertEquals(parser.buffer_size, 1024) 554 555 self.n = 0 556 parser.Parse(xml1, 0) 557 parser.buffer_size *= 2 558 self.assertEquals(parser.buffer_size, 2048) 559 parser.Parse(xml2, 1) 560 self.assertEquals(self.n, 2) 561 562 def test_change_size_2(self): 563 xml1 = "<?xml version='1.0' encoding='iso8859'?><a>a<s>%s" % ('a' * 1023) 564 xml2 = "aaa</s><s>%s</s></a>" % ('a' * 1025) 565 parser = expat.ParserCreate() 566 parser.CharacterDataHandler = self.counting_handler 567 parser.buffer_text = 1 568 parser.buffer_size = 2048 569 self.assertEquals(parser.buffer_size, 2048) 570 571 self.n=0 572 parser.Parse(xml1, 0) 573 parser.buffer_size = parser.buffer_size // 2 574 self.assertEquals(parser.buffer_size, 1024) 575 parser.Parse(xml2, 1) 576 self.assertEquals(self.n, 4) 577 
578class MalformedInputText(unittest.TestCase): 579 def test1(self): 580 xml = "\0\r\n" 581 parser = expat.ParserCreate() 582 try: 583 parser.Parse(xml, True) 584 self.fail() 585 except expat.ExpatError as e: 586 self.assertEquals(str(e), 'unclosed token: line 2, column 0') 587 588 def test2(self): 589 xml = "<?xml version\xc2\x85='1.0'?>\r\n" 590 parser = expat.ParserCreate() 591 try: 592 parser.Parse(xml, True) 593 self.fail() 594 except expat.ExpatError as e: 595 self.assertEquals(str(e), 'XML declaration not well-formed: line 1, column 14') 596 597def test_main(): 598 run_unittest(SetAttributeTest, 599 ParseTest, 600 NamespaceSeparatorTest, 601 InterningTest, 602 BufferTextTest, 603 HandlerExceptionTest, 604 PositionTest, 605 sf1296433Test, 606 ChardataBufferTest, 607 MalformedInputText) 608 609if __name__ == "__main__": 610 test_main() 611