libxml.py revision 87f3287d9b00bdc39be032a8a3f2d46860eaf787
import libxml2mod
import types
import sys

# The root of all libxml2 errors.
class libxmlError(Exception): pass

# Type of the wrapper class for the C objects wrappers
def checkWrapper(obj):
    """Tell whether obj is NOT an acceptable wrapped C object.

    Returns 0 when obj is None (the default used by the wrapper
    constructors) or when its type is named 'PyCObject' or 'PyCapsule',
    and 1 for anything else.
    """
    # The original code inspected an undefined name (_obj), so the
    # NameError was swallowed and the check always answered 0.
    if obj is None:
        return 0
    if type(obj).__name__ in ('PyCObject', 'PyCapsule'):
        return 0
    return 1

#
# id() is sometimes negative ...
#
def pos_id(o):
    """Return id(o), remapped to a positive number when it is negative."""
    i = id(o)
    if i < 0:
        return sys.maxsize - i
    return i

#
# Errors raised by the wrappers when some tree handling failed.
#
class treeError(libxmlError):
    """Error raised when a tree operation failed; str() gives the message."""
    def __init__(self, msg):
        # Initialise the base class too so that e.args carries the message.
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class parserError(libxmlError):
    """Error raised when parsing failed; str() gives the message."""
    def __init__(self, msg):
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class uriError(libxmlError):
    """Error raised when URI handling failed; str() gives the message."""
    def __init__(self, msg):
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class xpathError(libxmlError):
    """Error raised when an XPath operation failed; str() gives the message."""
    def __init__(self, msg):
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class ioWrapper:
    """Adapter exposing io_close/io_flush/io_read/io_write on top of a
    Python file-like object. Each io_* method returns -1 once the
    underlying object has been dropped."""
    def __init__(self, _obj):
        self.__io = _obj
        self._o = None

    def io_close(self):
        """Close and forget the underlying object; 0 on success, -1 if
        there is no underlying object anymore."""
        if self.__io is None:
            return -1
        self.__io.close()
        self.__io = None
        return 0

    def io_flush(self):
        """Flush the underlying object; 0 on success, -1 if there is no
        underlying object anymore."""
        if self.__io is None:
            return -1
        self.__io.flush()
        return 0

    def io_read(self, len = -1):
        """Read up to len items (everything when len is negative).

        Returns what the underlying read() returned, or -1 when there is
        no underlying object or when the read raised. After a failed read
        the underlying object is dropped."""
        if self.__io is None:
            return -1
        try:
            if len < 0:
                ret = self.__io.read()
            else:
                ret = self.__io.read(len)
        except Exception as e:
            print("failed to read from Python:", type(e))
            print("on IO:", self.__io)
            # was "self.__io == None": a comparison with no effect
            self.__io = None
            return -1

        return ret

    def io_write(self, str, len = -1):
        """Write str to the underlying object and return what its write()
        returned, or -1 if there is no underlying object anymore."""
        if self.__io is None:
            return -1
        if len < 0:
            return self.__io.write(str)
        # NOTE(review): standard file objects only accept one argument to
        # write(); this path presumably targets custom objects - confirm.
        return self.__io.write(str, len)

class ioReadWrapper(ioWrapper):
    """ioWrapper bound to a parser input buffer created with
    libxml2mod.xmlCreateInputBuffer()."""
    def __init__(self, _obj, enc = ""):
        ioWrapper.__init__(self, _obj)
        self._o = libxml2mod.xmlCreateInputBuffer(self, enc)

    def __del__(self):
        # The debugging print("__del__") that used to be here was removed;
        # the sibling ioWriteWrapper already had it commented out.
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
        self._o = None

    def close(self):
        """Close the Python object and free the parser input buffer."""
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
        self._o = None

class ioWriteWrapper(ioWrapper):
    """ioWrapper bound to an output buffer handle.

    The enc argument is accepted but not used by this class."""
    def __init__(self, _obj, enc = ""):
        if isinstance(_obj, str):
            print("write io from a string")
            # Initialise the base class so that __del__/flush/close find
            # their attributes (the original set a misspelled "self.o"
            # and left the wrapper half-constructed).
            ioWrapper.__init__(self, None)
            self._o = None
        else:
            # PyCapsule handles and every other kind of object follow the
            # same path: wrap the Python file attached to the output
            # buffer when there is one, the object itself otherwise.
            file = libxml2mod.outputBufferGetPythonFile(_obj)
            if file is not None:
                ioWrapper.__init__(self, file)
            else:
                ioWrapper.__init__(self, _obj)
            self._o = _obj

    def __del__(self):
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

    def flush(self):
        """Flush the Python object, then close the output buffer."""
        self.io_flush()
        # NOTE(review): this closes the output buffer rather than merely
        # flushing it - kept as is, confirm that this is intended.
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

    def close(self):
        """Flush the Python object, then close the output buffer."""
        self.io_flush()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

#
# Example of a class to handle SAX events
#
class SAXCallback:
    """Base class for SAX handlers"""
    def startDocument(self):
        """called at the start of the document"""
        pass

    def endDocument(self):
        """called at the end of the document"""
        pass

    def startElement(self, tag, attrs):
        """called at the start of every element, tag is the name of
           the element, attrs is a dictionary of the element's attributes"""
        pass

    def endElement(self, tag):
        """called at the end of every element, tag is the name of
           the element"""
        pass

    def characters(self, data):
        """called when character data have been read, data is the string
           containing the data, multiple consecutive characters() callback
           are possible."""
        pass

    def cdataBlock(self, data):
        """called when CDATA section have been read, data is the string
           containing the data, multiple consecutive cdataBlock() callback
           are possible."""
        pass

    def reference(self, name):
        """called when an entity reference has been found"""
        pass

    def ignorableWhitespace(self, data):
        """called when potentially ignorable white spaces have been found"""
        pass

    def processingInstruction(self, target, data):
        """called when a PI has been found, target contains the PI name and
           data is the associated data in the PI"""
        pass

    def comment(self, content):
        """called when a comment has been found, content contains the comment"""
        pass

    def externalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def internalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def entityDecl(self, name, type, externalID, systemID, content):
        """called when an ENTITY declaration has been found, name is the
           entity name and externalID, systemID are the entity public and
           system identifier for that entity if available, type indicates
           the entity type, and content reports its string content"""
        pass

    def notationDecl(self, name, externalID, systemID):
        """called when a NOTATION declaration has been found, name is the
           notation name and externalID, systemID are the notation public and
           system identifier for that notation if available"""
        pass

    def attributeDecl(self, elem, name, type, defi, defaultValue, nameList):
        """called when an ATTRIBUTE definition has been found"""
        pass

    def elementDecl(self, name, type, content):
        """called when an ELEMENT definition has been found"""
        pass

    # This handler used to be a second definition of entityDecl(), which
    # silently replaced the five-argument entityDecl() above.
    def unparsedEntityDecl(self, name, publicId, systemID, notationName):
        """called when an unparsed ENTITY declaration has been found,
           name is the entity name and publicId, systemID are the entity
           public and system identifier for that entity if available,
           and notationName indicates the associated NOTATION"""
        pass

    def warning(self, msg):
        """called on a parser warning; ignored by default"""
        pass

    def error(self, msg):
        """called on a parser error; raises parserError(msg)"""
        raise parserError(msg)

    def fatalError(self, msg):
        """called on a fatal parser error; raises parserError(msg)"""
        raise parserError(msg)

#
# This class is the ancestor of all the Node classes.
# It provides the basic functionalities shared by all nodes (and handles
# gracefully the exception), like name, navigation in the tree,
# doc reference, content access and serializing to a string or URI
#
class xmlCore:
    """Common base of the node wrappers; self._o holds the wrapped
    C object (or None)."""
    def __init__(self, _obj=None):
        self._o = _obj

    def __eq__(self, other):
        # Anything that does not wrap a C object cannot be the same node.
        if other is None or not hasattr(other, "_o"):
            return False
        ret = libxml2mod.compareNodesEqual(self._o, other._o)
        if ret is None:
            return False
        return ret == True
    def __ne__(self, other):
        if other is None or not hasattr(other, "_o"):
            return True
        ret = libxml2mod.compareNodesEqual(self._o, other._o)
        return not ret
    def __hash__(self):
        return libxml2mod.nodeHash(self._o)

    def __str__(self):
        return self.serialize()
    def get_parent(self):
        """Return the wrapped parent node, or None."""
        ret = libxml2mod.parent(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_children(self):
        """Return the wrapped first child node, or None."""
        ret = libxml2mod.children(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_last(self):
        """Return the wrapped node given by libxml2mod.last(), or None."""
        ret = libxml2mod.last(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_next(self):
        """Return the wrapped next sibling, or None."""
        ret = libxml2mod.next(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_properties(self):
        """Return the first attribute as an xmlAttr, or None."""
        ret = libxml2mod.properties(self._o)
        if ret is None:
            return None
        return xmlAttr(_obj=ret)
    def get_prev(self):
        """Return the wrapped previous sibling, or None."""
        ret = libxml2mod.prev(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_content(self):
        """Return the node content as given by xmlNodeGetContent()."""
        return libxml2mod.xmlNodeGetContent(self._o)
    getContent = get_content # why is this duplicate naming needed ?
    def get_name(self):
        """Return the node name."""
        return libxml2mod.name(self._o)
    def get_type(self):
        """Return the node type name."""
        return libxml2mod.type(self._o)
    def get_doc(self):
        """Return the owning document as an xmlDoc, or None.

        When the node has no document but is itself a document node,
        the node is returned wrapped as an xmlDoc."""
        ret = libxml2mod.doc(self._o)
        if ret is None:
            if self.type in ["document_xml", "document_html"]:
                return xmlDoc(_obj=self._o)
            else:
                return None
        return xmlDoc(_obj=ret)
    #
    # Those are common attributes to nearly all type of nodes
    # defined as python2 properties
    #
    # The __getattr__ based fallback for interpreters older than 2.2 was
    # dropped: this file already requires a much more recent syntax.
    parent = property(get_parent, None, None, "Parent node")
    children = property(get_children, None, None, "First child node")
    last = property(get_last, None, None, "Last sibling node")
    next = property(get_next, None, None, "Next sibling node")
    prev = property(get_prev, None, None, "Previous sibling node")
    properties = property(get_properties, None, None, "List of properties")
    content = property(get_content, None, None, "Content of this node")
    name = property(get_name, None, None, "Node name")
    type = property(get_type, None, None, "Node type")
    doc = property(get_doc, None, None, "The document this node belongs to")

    #
    # Serialization routines, the optional arguments have the following
    # meaning:
    #     encoding: string to ask saving in a specific encoding
    #     format: if 1 the serializer is asked to indent the output
    #
    def serialize(self, encoding = None, format = 0):
        return libxml2mod.serializeNode(self._o, encoding, format)
    def saveTo(self, file, encoding = None, format = 0):
        return libxml2mod.saveNodeTo(self._o, file, encoding, format)

    #
    # Canonicalization routines:
    #
    #   nodes: the node set (tuple or list) to be included in the
    #     canonized image or None if all document nodes should be
    #     included.
    #   exclusive: the exclusive flag (0 - non-exclusive
    #     canonicalization; otherwise - exclusive canonicalization)
    #   prefixes: the list of inclusive namespace prefixes (strings),
    #     or None if there is no inclusive namespaces (only for
    #     exclusive canonicalization, ignored otherwise)
    #   with_comments: include comments in the result (!=0) or not
    #     (==0)
    def c14nMemory(self,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocDumpMemory(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0)
    def c14nSaveTo(self,
                   file,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocSaveTo(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0,
            file)

    #
    # Selecting nodes using XPath, a bit slow because the context
    # is allocated/freed every time but convenient.
    #
    def xpathEval(self, expr):
        """Evaluate the XPath expression expr with this node as context
        node; returns None when the node has no document."""
        doc = self.doc
        if doc is None:
            return None
        ctxt = doc.xpathNewContext()
        try:
            ctxt.setContextNode(self)
            return ctxt.xpathEval(expr)
        finally:
            # release the context even when the evaluation raised
            ctxt.xpathFreeContext()

    #
    # The variant caching one context per xmlDoc was removed
    # (DV memleaks c.f. #126735); the name is kept for compatibility.
    #
    def xpathEval2(self, expr):
        return self.xpathEval(expr)

    # Remove namespaces
    def removeNsDef(self, href):
        """
        Remove a namespace definition from a node.  If href is None,
        remove all of the ns definitions on that node.  The removed
        namespaces are returned as a linked list.

        Note: If any child nodes referred to the removed namespaces,
        they will be left with dangling links.  You should call
        reconciliateNs() to fix those pointers.

        Note: This method does not free memory taken by the ns
        definitions.  You will need to free it manually with the
        freeNsList() method on the returned xmlNs object.
        """

        ret = libxml2mod.xmlNodeRemoveNsDef(self._o, href)
        if ret is None:
            return None
        return xmlNs(_obj=ret)

    # support for python2 iterators
    def walk_depth_first(self):
        return xmlCoreDepthFirstItertor(self)
    def walk_breadth_first(self):
        return xmlCoreBreadthFirstItertor(self)
    __iter__ = walk_depth_first

    def free(self):
        """Call libxml2mod.xmlFreeDoc() on the wrapped object, after a
        best-effort attempt to free an XPath context cached on the
        document."""
        try:
            self.doc._ctxt.xpathFreeContext()
        except Exception:
            # usually there is no cached context: nothing to release
            pass
        libxml2mod.xmlFreeDoc(self._o)


#
# implements the depth-first iterator for libxml2 DOM tree
#
class xmlCoreDepthFirstItertor:
    """Yield a node, then its descendants (through .children), then its
    following siblings (through .next)."""
    def __init__(self, node):
        self.node = node
        self.parents = []
    def __iter__(self):
        return self
    def next(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                self.node = self.node.children
                return ret
            try:
                parent = self.parents.pop()
            except IndexError:
                raise StopIteration
            self.node = parent.next
    # name of the iterator protocol method since Python 3
    __next__ = next

#
# implements the breadth-first iterator for libxml2 DOM tree
#
class xmlCoreBreadthFirstItertor:
    """Yield a node and its following siblings (through .next), then
    descend into the children of the most recently yielded nodes."""
    def __init__(self, node):
        self.node = node
        self.parents = []
    def __iter__(self):
        return self
    def next(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                self.node = self.node.next
                return ret
            try:
                parent = self.parents.pop()
            except IndexError:
                raise StopIteration
            self.node = parent.children
    # name of the iterator protocol method since Python 3
    __next__ = next

#
# converters to
# present a nicer view of the XPath returns
#
def nodeWrap(o):
    """Wrap the C node o in the wrapper class matching the type name
    reported by libxml2mod.type(); unknown types get an xmlNode."""
    # TODO try to cast to the most appropriate node class
    name = libxml2mod.type(o)
    if name == "element" or name == "text":
        return xmlNode(_obj=o)
    if name == "attribute":
        return xmlAttr(_obj=o)
    if name[0:8] == "document":
        return xmlDoc(_obj=o)
    if name == "namespace":
        return xmlNs(_obj=o)
    if name == "elem_decl":
        return xmlElement(_obj=o)
    if name == "attribute_decl":
        return xmlAttribute(_obj=o)
    if name == "entity_decl":
        return xmlEntity(_obj=o)
    if name == "dtd":
        return xmlDtd(_obj=o)
    return xmlNode(_obj=o)

def xpathObjectRet(o):
    """Convert an XPath result for Python callers.

    Lists and tuples are converted recursively (keeping list/tuple),
    strings and numbers are returned unchanged, everything else is
    passed to nodeWrap()."""
    # isinstance() rather than exact type comparison, so that booleans
    # and subclasses of the plain types are not mistaken for nodes.
    if isinstance(o, list):
        return [xpathObjectRet(item) for item in o]
    if isinstance(o, tuple):
        return tuple([xpathObjectRet(item) for item in o])
    if isinstance(o, (str, int, float)):
        return o
    return nodeWrap(o)

#
# register an XPath function
#
def registerXPathFunction(ctxt, name, ns_uri, f):
    """Register f as the XPath function name in namespace ns_uri on
    ctxt and return the result of the underlying call (it used to be
    computed and dropped)."""
    return libxml2mod.xmlRegisterXPathFunction(ctxt, name, ns_uri, f)

#
# For the xmlTextReader parser configuration
#
PARSER_LOADDTD=1
PARSER_DEFAULTATTRS=2
PARSER_VALIDATE=3
PARSER_SUBST_ENTITIES=4

#
# For the error callback severities
#
PARSER_SEVERITY_VALIDITY_WARNING=1
PARSER_SEVERITY_VALIDITY_ERROR=2
PARSER_SEVERITY_WARNING=3
PARSER_SEVERITY_ERROR=4

#
# register the libxml2 error handler
#
def registerErrorHandler(f, ctx):
    """Register a Python written function for error reporting.
       The function is called back as f(ctx, error). """
    import sys
    if 'libxslt' not in sys.modules:
        # normal behaviour when libxslt is not imported
        ret = libxml2mod.xmlRegisterErrorHandler(f,ctx)
    else:
        # when libxslt is already imported, one must
        # use libxslt's error handler instead
        import libxslt
        ret = libxslt.registerErrorHandler(f,ctx)
    return ret

class parserCtxtCore:
    """Base of the parser context wrapper; frees the wrapped context
    with xmlFreeParserCtxt() when the wrapper is deleted."""

    def __init__(self, _obj=None):
        self._o = _obj

    def __del__(self):
        if self._o is not None:
            libxml2mod.xmlFreeParserCtxt(self._o)
        self._o = None

    def setErrorHandler(self,f,arg):
        """Register an error handler that will be called back as
           f(arg,msg,severity,reserved).

           @reserved is currently always None."""
        libxml2mod.xmlParserCtxtSetErrorHandler(self._o,f,arg)

    def getErrorHandler(self):
        """Return (f,arg) as previously registered with setErrorHandler
           or (None,None)."""
        return libxml2mod.xmlParserCtxtGetErrorHandler(self._o)

    def addLocalCatalog(self, uri):
        """Register a local catalog with the parser"""
        return libxml2mod.addLocalCatalog(self._o, uri)


class ValidCtxtCore:
    """Mixin for DTD validation contexts; relies on self._o being set
    by the class it is combined with."""

    def __init__(self, *args, **kw):
        pass

    def setValidityErrorHandler(self, err_func, warn_func, arg=None):
        """
        Register error and warning handlers for DTD validation.
        These will be called back as f(msg,arg)
        """
        libxml2mod.xmlSetValidErrors(self._o, err_func, warn_func, arg)


class SchemaValidCtxtCore:
    """Mixin for Schema validation contexts; relies on self._o being
    set by the class it is combined with."""

    def __init__(self, *args, **kw):
        pass

    def setValidityErrorHandler(self, err_func, warn_func, arg=None):
        """
        Register error and warning handlers for Schema validation.
        These will be called back as f(msg,arg)
        """
        libxml2mod.xmlSchemaSetValidErrors(self._o, err_func, warn_func, arg)


class relaxNgValidCtxtCore:
    """Mixin for RelaxNG validation contexts; relies on self._o being
    set by the class it is combined with."""

    def __init__(self, *args, **kw):
        pass

    def setValidityErrorHandler(self, err_func, warn_func, arg=None):
        """
        Register error and warning handlers for RelaxNG validation.
        These will be called back as f(msg,arg)
        """
        libxml2mod.xmlRelaxNGSetValidErrors(self._o, err_func, warn_func, arg)


def _xmlTextReaderErrorFunc(xxx_todo_changeme,msg,severity,locator):
    """Intermediate callback to wrap the locator"""
    (f,arg) = xxx_todo_changeme
    return f(arg,msg,severity,xmlTextReaderLocator(locator))

class xmlTextReaderCore:
    """Base of the text reader wrapper; frees the wrapped reader with
    xmlFreeTextReader() when the wrapper is deleted."""

    def __init__(self, _obj=None):
        self.input = None
        self._o = _obj

    def __del__(self):
        if self._o is not None:
            libxml2mod.xmlFreeTextReader(self._o)
        self._o = None

    def SetErrorHandler(self,f,arg):
        """Register an error handler that will be called back as
           f(arg,msg,severity,locator)."""
        if f is None:
            libxml2mod.xmlTextReaderSetErrorHandler(
                self._o,None,None)
        else:
            # The user handler goes through _xmlTextReaderErrorFunc, the
            # (f,arg) pair being passed as the registered argument.
            libxml2mod.xmlTextReaderSetErrorHandler(
                self._o,_xmlTextReaderErrorFunc,(f,arg))

    def GetErrorHandler(self):
        """Return (f,arg) as previously registered with SetErrorHandler
           or (None,None)."""
        f,arg = libxml2mod.xmlTextReaderGetErrorHandler(self._o)
        if f is None:
            return None,None
        else:
            # assert f is _xmlTextReaderErrorFunc
            # arg is the (f,arg) pair stored by SetErrorHandler
            return arg

#
# The cleanup now goes though a wrapper in libxml.c
#
def cleanupParser():
    libxml2mod.xmlPythonCleanupParser()

#
# The interface to xmlRegisterInputCallbacks.
# Since this API does not allow to pass a data object along with
# match/open callbacks, it is necessary to maintain a list of all
# Python callbacks.
#
__input_callbacks = []
def registerInputCallback(func):
    """Add func to the list of Python input callbacks.

    A dispatcher is handed to libxml2mod.xmlRegisterInputCallback();
    when called with a URI it tries the Python callbacks starting with
    the most recently added one and returns the first result that is
    not None."""
    def findOpenCallback(URI):
        for handler in reversed(__input_callbacks):
            opened = handler(URI)
            if opened is not None:
                return opened
        return None
    libxml2mod.xmlRegisterInputCallback(findOpenCallback)
    __input_callbacks.append(func)

def popInputCallbacks():
    """Drop the most recently added Python input callback, if any, and
    call libxml2mod.xmlUnregisterInputCallback() whenever the list of
    Python callbacks is empty afterwards."""
    # First pop python-level callbacks, when no more available - start
    # popping built-in ones.
    if __input_callbacks:
        __input_callbacks.pop()
    if not __input_callbacks:
        libxml2mod.xmlUnregisterInputCallback()

# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
#
# Everything before this line comes from libxml.py
# Everything after this line is automatically generated
#
# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
