# test_pyexpat.py revision 3c9e6e9375954e9911ceddb65a98c3e58986e9a7
1# XXX TypeErrors on calling handlers, or on bad return values from a 2# handler, are obscure and unhelpful. 3 4from io import BytesIO 5import sys 6import unittest 7 8from xml.parsers import expat 9 10from test.support import sortdict, run_unittest 11 12 13class SetAttributeTest(unittest.TestCase): 14 def setUp(self): 15 self.parser = expat.ParserCreate(namespace_separator='!') 16 self.set_get_pairs = [ 17 [0, 0], 18 [1, 1], 19 [2, 1], 20 [0, 0], 21 ] 22 23 def test_ordered_attributes(self): 24 for x, y in self.set_get_pairs: 25 self.parser.ordered_attributes = x 26 self.assertEquals(self.parser.ordered_attributes, y) 27 28 def test_specified_attributes(self): 29 for x, y in self.set_get_pairs: 30 self.parser.specified_attributes = x 31 self.assertEquals(self.parser.specified_attributes, y) 32 33 34data = b'''\ 35<?xml version="1.0" encoding="iso-8859-1" standalone="no"?> 36<?xml-stylesheet href="stylesheet.css"?> 37<!-- comment data --> 38<!DOCTYPE quotations SYSTEM "quotations.dtd" [ 39<!ELEMENT root ANY> 40<!NOTATION notation SYSTEM "notation.jpeg"> 41<!ENTITY acirc "â"> 42<!ENTITY external_entity SYSTEM "entity.file"> 43<!ENTITY unparsed_entity SYSTEM "entity.file" NDATA notation> 44%unparsed_entity; 45]> 46 47<root attr1="value1" attr2="value2ὀ"> 48<myns:subelement xmlns:myns="http://www.python.org/namespace"> 49 Contents of subelements 50</myns:subelement> 51<sub2><![CDATA[contents of CDATA section]]></sub2> 52&external_entity; 53</root> 54''' 55 56 57# Produce UTF-8 output 58class ParseTest(unittest.TestCase): 59 class Outputter: 60 def __init__(self): 61 self.out = [] 62 63 def StartElementHandler(self, name, attrs): 64 self.out.append('Start element: ' + repr(name) + ' ' + 65 sortdict(attrs)) 66 67 def EndElementHandler(self, name): 68 self.out.append('End element: ' + repr(name)) 69 70 def CharacterDataHandler(self, data): 71 data = data.strip() 72 if data: 73 self.out.append('Character data: ' + repr(data)) 74 75 def ProcessingInstructionHandler(self, 
target, data): 76 self.out.append('PI: ' + repr(target) + ' ' + repr(data)) 77 78 def StartNamespaceDeclHandler(self, prefix, uri): 79 self.out.append('NS decl: ' + repr(prefix) + ' ' + repr(uri)) 80 81 def EndNamespaceDeclHandler(self, prefix): 82 self.out.append('End of NS decl: ' + repr(prefix)) 83 84 def StartCdataSectionHandler(self): 85 self.out.append('Start of CDATA section') 86 87 def EndCdataSectionHandler(self): 88 self.out.append('End of CDATA section') 89 90 def CommentHandler(self, text): 91 self.out.append('Comment: ' + repr(text)) 92 93 def NotationDeclHandler(self, *args): 94 name, base, sysid, pubid = args 95 self.out.append('Notation declared: %s' %(args,)) 96 97 def UnparsedEntityDeclHandler(self, *args): 98 entityName, base, systemId, publicId, notationName = args 99 self.out.append('Unparsed entity decl: %s' %(args,)) 100 101 def NotStandaloneHandler(self, userData): 102 self.out.append('Not standalone') 103 return 1 104 105 def ExternalEntityRefHandler(self, *args): 106 context, base, sysId, pubId = args 107 self.out.append('External entity ref: %s' %(args[1:],)) 108 return 1 109 110 def DefaultHandler(self, userData): 111 pass 112 113 def DefaultHandlerExpand(self, userData): 114 pass 115 116 handler_names = [ 117 'StartElementHandler', 'EndElementHandler', 118 'CharacterDataHandler', 'ProcessingInstructionHandler', 119 'UnparsedEntityDeclHandler', 'NotationDeclHandler', 120 'StartNamespaceDeclHandler', 'EndNamespaceDeclHandler', 121 'CommentHandler', 'StartCdataSectionHandler', 122 'EndCdataSectionHandler', 123 'DefaultHandler', 'DefaultHandlerExpand', 124 #'NotStandaloneHandler', 125 'ExternalEntityRefHandler' 126 ] 127 128 def _verify_parse_output(self, op): 129 self.assertEquals(op[0], 'PI: \'xml-stylesheet\' \'href="stylesheet.css"\'') 130 self.assertEquals(op[1], "Comment: ' comment data '") 131 self.assertEquals(op[2], "Notation declared: ('notation', None, 'notation.jpeg', None)") 132 self.assertEquals(op[3], "Unparsed entity decl: 
('unparsed_entity', None, 'entity.file', None, 'notation')") 133 self.assertEquals(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\u1f40'}") 134 self.assertEquals(op[5], "NS decl: 'myns' 'http://www.python.org/namespace'") 135 self.assertEquals(op[6], "Start element: 'http://www.python.org/namespace!subelement' {}") 136 self.assertEquals(op[7], "Character data: 'Contents of subelements'") 137 self.assertEquals(op[8], "End element: 'http://www.python.org/namespace!subelement'") 138 self.assertEquals(op[9], "End of NS decl: 'myns'") 139 self.assertEquals(op[10], "Start element: 'sub2' {}") 140 self.assertEquals(op[11], 'Start of CDATA section') 141 self.assertEquals(op[12], "Character data: 'contents of CDATA section'") 142 self.assertEquals(op[13], 'End of CDATA section') 143 self.assertEquals(op[14], "End element: 'sub2'") 144 self.assertEquals(op[15], "External entity ref: (None, 'entity.file', None)") 145 self.assertEquals(op[16], "End element: 'root'") 146 147 148 def test_unicode(self): 149 # Try the parse again, this time producing Unicode output 150 out = self.Outputter() 151 parser = expat.ParserCreate(namespace_separator='!') 152 for name in self.handler_names: 153 setattr(parser, name, getattr(out, name)) 154 155 parser.Parse(data, 1) 156 157 op = out.out 158 self._verify_parse_output(op) 159 160 def test_parse_file(self): 161 # Try parsing a file 162 out = self.Outputter() 163 parser = expat.ParserCreate(namespace_separator='!') 164 for name in self.handler_names: 165 setattr(parser, name, getattr(out, name)) 166 file = BytesIO(data) 167 168 parser.ParseFile(file) 169 170 op = out.out 171 self._verify_parse_output(op) 172 173class NamespaceSeparatorTest(unittest.TestCase): 174 def test_legal(self): 175 # Tests that make sure we get errors when the namespace_separator value 176 # is illegal, and that we don't for good values: 177 expat.ParserCreate() 178 expat.ParserCreate(namespace_separator=None) 179 
expat.ParserCreate(namespace_separator=' ') 180 181 def test_illegal(self): 182 try: 183 expat.ParserCreate(namespace_separator=42) 184 self.fail() 185 except TypeError as e: 186 self.assertEquals(str(e), 187 'ParserCreate() argument 2 must be str or None, not int') 188 189 try: 190 expat.ParserCreate(namespace_separator='too long') 191 self.fail() 192 except ValueError as e: 193 self.assertEquals(str(e), 194 'namespace_separator must be at most one character, omitted, or None') 195 196 def test_zero_length(self): 197 # ParserCreate() needs to accept a namespace_separator of zero length 198 # to satisfy the requirements of RDF applications that are required 199 # to simply glue together the namespace URI and the localname. Though 200 # considered a wart of the RDF specifications, it needs to be supported. 201 # 202 # See XML-SIG mailing list thread starting with 203 # http://mail.python.org/pipermail/xml-sig/2001-April/005202.html 204 # 205 expat.ParserCreate(namespace_separator='') # too short 206 207 208class InterningTest(unittest.TestCase): 209 def test(self): 210 # Test the interning machinery. 211 p = expat.ParserCreate() 212 L = [] 213 def collector(name, *args): 214 L.append(name) 215 p.StartElementHandler = collector 216 p.EndElementHandler = collector 217 p.Parse("<e> <e/> <e></e> </e>", 1) 218 tag = L[0] 219 self.assertEquals(len(L), 6) 220 for entry in L: 221 # L should have the same string repeated over and over. 
222 self.assertTrue(tag is entry) 223 224 225class BufferTextTest(unittest.TestCase): 226 def setUp(self): 227 self.stuff = [] 228 self.parser = expat.ParserCreate() 229 self.parser.buffer_text = 1 230 self.parser.CharacterDataHandler = self.CharacterDataHandler 231 232 def check(self, expected, label): 233 self.assertEquals(self.stuff, expected, 234 "%s\nstuff = %r\nexpected = %r" 235 % (label, self.stuff, map(str, expected))) 236 237 def CharacterDataHandler(self, text): 238 self.stuff.append(text) 239 240 def StartElementHandler(self, name, attrs): 241 self.stuff.append("<%s>" % name) 242 bt = attrs.get("buffer-text") 243 if bt == "yes": 244 self.parser.buffer_text = 1 245 elif bt == "no": 246 self.parser.buffer_text = 0 247 248 def EndElementHandler(self, name): 249 self.stuff.append("</%s>" % name) 250 251 def CommentHandler(self, data): 252 self.stuff.append("<!--%s-->" % data) 253 254 def setHandlers(self, handlers=[]): 255 for name in handlers: 256 setattr(self.parser, name, getattr(self, name)) 257 258 def test_default_to_disabled(self): 259 parser = expat.ParserCreate() 260 self.assertFalse(parser.buffer_text) 261 262 def test_buffering_enabled(self): 263 # Make sure buffering is turned on 264 self.assertTrue(self.parser.buffer_text) 265 self.parser.Parse("<a>1<b/>2<c/>3</a>", 1) 266 self.assertEquals(self.stuff, ['123'], 267 "buffered text not properly collapsed") 268 269 def test1(self): 270 # XXX This test exposes more detail of Expat's text chunking than we 271 # XXX like, but it tests what we need to concisely. 
272 self.setHandlers(["StartElementHandler"]) 273 self.parser.Parse("<a>1<b buffer-text='no'/>2\n3<c buffer-text='yes'/>4\n5</a>", 1) 274 self.assertEquals(self.stuff, 275 ["<a>", "1", "<b>", "2", "\n", "3", "<c>", "4\n5"], 276 "buffering control not reacting as expected") 277 278 def test2(self): 279 self.parser.Parse("<a>1<b/><2><c/> \n 3</a>", 1) 280 self.assertEquals(self.stuff, ["1<2> \n 3"], 281 "buffered text not properly collapsed") 282 283 def test3(self): 284 self.setHandlers(["StartElementHandler"]) 285 self.parser.Parse("<a>1<b/>2<c/>3</a>", 1) 286 self.assertEquals(self.stuff, ["<a>", "1", "<b>", "2", "<c>", "3"], 287 "buffered text not properly split") 288 289 def test4(self): 290 self.setHandlers(["StartElementHandler", "EndElementHandler"]) 291 self.parser.CharacterDataHandler = None 292 self.parser.Parse("<a>1<b/>2<c/>3</a>", 1) 293 self.assertEquals(self.stuff, 294 ["<a>", "<b>", "</b>", "<c>", "</c>", "</a>"]) 295 296 def test5(self): 297 self.setHandlers(["StartElementHandler", "EndElementHandler"]) 298 self.parser.Parse("<a>1<b></b>2<c/>3</a>", 1) 299 self.assertEquals(self.stuff, 300 ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", "</a>"]) 301 302 def test6(self): 303 self.setHandlers(["CommentHandler", "EndElementHandler", 304 "StartElementHandler"]) 305 self.parser.Parse("<a>1<b/>2<c></c>345</a> ", 1) 306 self.assertEquals(self.stuff, 307 ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "345", "</a>"], 308 "buffered text not properly split") 309 310 def test7(self): 311 self.setHandlers(["CommentHandler", "EndElementHandler", 312 "StartElementHandler"]) 313 self.parser.Parse("<a>1<b/>2<c></c>3<!--abc-->4<!--def-->5</a> ", 1) 314 self.assertEquals(self.stuff, 315 ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", 316 "<!--abc-->", "4", "<!--def-->", "5", "</a>"], 317 "buffered text not properly split") 318 319 320# Test handling of exception from callback: 321class HandlerExceptionTest(unittest.TestCase): 322 def 
StartElementHandler(self, name, attrs): 323 raise RuntimeError(name) 324 325 def test(self): 326 parser = expat.ParserCreate() 327 parser.StartElementHandler = self.StartElementHandler 328 try: 329 parser.Parse("<a><b><c/></b></a>", 1) 330 self.fail() 331 except RuntimeError as e: 332 self.assertEquals(e.args[0], 'a', 333 "Expected RuntimeError for element 'a', but" + \ 334 " found %r" % e.args[0]) 335 336 337# Test Current* members: 338class PositionTest(unittest.TestCase): 339 def StartElementHandler(self, name, attrs): 340 self.check_pos('s') 341 342 def EndElementHandler(self, name): 343 self.check_pos('e') 344 345 def check_pos(self, event): 346 pos = (event, 347 self.parser.CurrentByteIndex, 348 self.parser.CurrentLineNumber, 349 self.parser.CurrentColumnNumber) 350 self.assertTrue(self.upto < len(self.expected_list), 351 'too many parser events') 352 expected = self.expected_list[self.upto] 353 self.assertEquals(pos, expected, 354 'Expected position %s, got position %s' %(pos, expected)) 355 self.upto += 1 356 357 def test(self): 358 self.parser = expat.ParserCreate() 359 self.parser.StartElementHandler = self.StartElementHandler 360 self.parser.EndElementHandler = self.EndElementHandler 361 self.upto = 0 362 self.expected_list = [('s', 0, 1, 0), ('s', 5, 2, 1), ('s', 11, 3, 2), 363 ('e', 15, 3, 6), ('e', 17, 4, 1), ('e', 22, 5, 0)] 364 365 xml = '<a>\n <b>\n <c/>\n </b>\n</a>' 366 self.parser.Parse(xml, 1) 367 368 369class sf1296433Test(unittest.TestCase): 370 def test_parse_only_xml_data(self): 371 # http://python.org/sf/1296433 372 # 373 xml = "<?xml version='1.0' encoding='iso8859'?><s>%s</s>" % ('a' * 1025) 374 # this one doesn't crash 375 #xml = "<?xml version='1.0'?><s>%s</s>" % ('a' * 10000) 376 377 class SpecificException(Exception): 378 pass 379 380 def handler(text): 381 raise SpecificException 382 383 parser = expat.ParserCreate() 384 parser.CharacterDataHandler = handler 385 386 self.assertRaises(Exception, parser.Parse, xml) 387 388class 
ChardataBufferTest(unittest.TestCase): 389 """ 390 test setting of chardata buffer size 391 """ 392 393 def test_1025_bytes(self): 394 self.assertEquals(self.small_buffer_test(1025), 2) 395 396 def test_1000_bytes(self): 397 self.assertEquals(self.small_buffer_test(1000), 1) 398 399 def test_wrong_size(self): 400 parser = expat.ParserCreate() 401 parser.buffer_text = 1 402 def f(size): 403 parser.buffer_size = size 404 405 self.assertRaises(ValueError, f, -1) 406 self.assertRaises(ValueError, f, 0) 407 408 def test_unchanged_size(self): 409 xml1 = ("<?xml version='1.0' encoding='iso8859'?><s>%s" % ('a' * 512)) 410 xml2 = 'a'*512 + '</s>' 411 parser = expat.ParserCreate() 412 parser.CharacterDataHandler = self.counting_handler 413 parser.buffer_size = 512 414 parser.buffer_text = 1 415 416 # Feed 512 bytes of character data: the handler should be called 417 # once. 418 self.n = 0 419 parser.Parse(xml1) 420 self.assertEquals(self.n, 1) 421 422 # Reassign to buffer_size, but assign the same size. 423 parser.buffer_size = parser.buffer_size 424 self.assertEquals(self.n, 1) 425 426 # Try parsing rest of the document 427 parser.Parse(xml2) 428 self.assertEquals(self.n, 2) 429 430 431 def test_disabling_buffer(self): 432 xml1 = "<?xml version='1.0' encoding='iso8859'?><a>%s" % ('a' * 512) 433 xml2 = ('b' * 1024) 434 xml3 = "%s</a>" % ('c' * 1024) 435 parser = expat.ParserCreate() 436 parser.CharacterDataHandler = self.counting_handler 437 parser.buffer_text = 1 438 parser.buffer_size = 1024 439 self.assertEquals(parser.buffer_size, 1024) 440 441 # Parse one chunk of XML 442 self.n = 0 443 parser.Parse(xml1, 0) 444 self.assertEquals(parser.buffer_size, 1024) 445 self.assertEquals(self.n, 1) 446 447 # Turn off buffering and parse the next chunk. 
448 parser.buffer_text = 0 449 self.assertFalse(parser.buffer_text) 450 self.assertEquals(parser.buffer_size, 1024) 451 for i in range(10): 452 parser.Parse(xml2, 0) 453 self.assertEquals(self.n, 11) 454 455 parser.buffer_text = 1 456 self.assertTrue(parser.buffer_text) 457 self.assertEquals(parser.buffer_size, 1024) 458 parser.Parse(xml3, 1) 459 self.assertEquals(self.n, 12) 460 461 462 463 def make_document(self, bytes): 464 return ("<?xml version='1.0'?><tag>" + bytes * 'a' + '</tag>') 465 466 def counting_handler(self, text): 467 self.n += 1 468 469 def small_buffer_test(self, buffer_len): 470 xml = "<?xml version='1.0' encoding='iso8859'?><s>%s</s>" % ('a' * buffer_len) 471 parser = expat.ParserCreate() 472 parser.CharacterDataHandler = self.counting_handler 473 parser.buffer_size = 1024 474 parser.buffer_text = 1 475 476 self.n = 0 477 parser.Parse(xml) 478 return self.n 479 480 def test_change_size_1(self): 481 xml1 = "<?xml version='1.0' encoding='iso8859'?><a><s>%s" % ('a' * 1024) 482 xml2 = "aaa</s><s>%s</s></a>" % ('a' * 1025) 483 parser = expat.ParserCreate() 484 parser.CharacterDataHandler = self.counting_handler 485 parser.buffer_text = 1 486 parser.buffer_size = 1024 487 self.assertEquals(parser.buffer_size, 1024) 488 489 self.n = 0 490 parser.Parse(xml1, 0) 491 parser.buffer_size *= 2 492 self.assertEquals(parser.buffer_size, 2048) 493 parser.Parse(xml2, 1) 494 self.assertEquals(self.n, 2) 495 496 def test_change_size_2(self): 497 xml1 = "<?xml version='1.0' encoding='iso8859'?><a>a<s>%s" % ('a' * 1023) 498 xml2 = "aaa</s><s>%s</s></a>" % ('a' * 1025) 499 parser = expat.ParserCreate() 500 parser.CharacterDataHandler = self.counting_handler 501 parser.buffer_text = 1 502 parser.buffer_size = 2048 503 self.assertEquals(parser.buffer_size, 2048) 504 505 self.n=0 506 parser.Parse(xml1, 0) 507 parser.buffer_size = parser.buffer_size // 2 508 self.assertEquals(parser.buffer_size, 1024) 509 parser.Parse(xml2, 1) 510 self.assertEquals(self.n, 4) 511 
512class MalformedInputText(unittest.TestCase): 513 def test1(self): 514 xml = "\0\r\n" 515 parser = expat.ParserCreate() 516 try: 517 parser.Parse(xml, True) 518 self.fail() 519 except expat.ExpatError as e: 520 self.assertEquals(str(e), 'unclosed token: line 2, column 0') 521 522 def test2(self): 523 xml = "<?xml version\xc2\x85='1.0'?>\r\n" 524 parser = expat.ParserCreate() 525 try: 526 parser.Parse(xml, True) 527 self.fail() 528 except expat.ExpatError as e: 529 self.assertEquals(str(e), 'XML declaration not well-formed: line 1, column 14') 530 531def test_main(): 532 run_unittest(SetAttributeTest, 533 ParseTest, 534 NamespaceSeparatorTest, 535 InterningTest, 536 BufferTextTest, 537 HandlerExceptionTest, 538 PositionTest, 539 sf1296433Test, 540 ChardataBufferTest, 541 MalformedInputText) 542 543if __name__ == "__main__": 544 test_main() 545