# libxml.py revision 37e63946dc2745dc1e546ac273e43512fb4685fc
import sys
import types

import libxml2mod


#
# Errors raised by the wrappers when some tree handling failed.
#
class treeError(Exception):
    """Error raised by the wrappers when a tree operation fails.

    The message is available both as ``str(err)`` and as ``err.msg``.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg


class parserError(Exception):
    """Error raised for parsing failures (see SAXCallback.error()).

    The message is available both as ``str(err)`` and as ``err.msg``.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg


class uriError(Exception):
    """Error type for URI handling; not raised in this part of the file.

    The message is available both as ``str(err)`` and as ``err.msg``.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg


class xpathError(Exception):
    """Error type for XPath handling; not raised in this part of the file.

    The message is available both as ``str(err)`` and as ``err.msg``.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg


class ioWrapper:
    """Expose a Python file-like object through io_* methods.

    The wrapped object is held privately; after io_close() every io_*
    method returns -1.  ``_o`` holds the associated libxml2 buffer, if any
    (set by the subclasses).
    """
    def __init__(self, _obj):
        self.__io = _obj
        self._o = None

    def io_close(self):
        """Close and drop the wrapped object; 0 on success, -1 if none."""
        if self.__io is None:
            return -1
        self.__io.close()
        self.__io = None
        return 0

    def io_flush(self):
        """Flush the wrapped object; 0 on success, -1 if none."""
        if self.__io is None:
            return -1
        self.__io.flush()
        return 0

    def io_read(self, len = -1):
        """Read up to `len` items (everything if negative); -1 if closed."""
        if self.__io is None:
            return -1
        if len < 0:
            return self.__io.read()
        return self.__io.read(len)

    def io_write(self, str, len = -1):
        """Write `str`, or only its first `len` items when len >= 0.

        Returns whatever the wrapped write() returns, or -1 if closed.
        """
        if self.__io is None:
            return -1
        if len < 0:
            return self.__io.write(str)
        # File-like write() takes a single argument, so truncate here
        # instead of passing the length through.
        return self.__io.write(str[0:len])


class ioReadWrapper(ioWrapper):
    """ioWrapper bound to a libxml2 input buffer created over itself."""
    def __init__(self, _obj, enc = ""):
        ioWrapper.__init__(self, _obj)
        self._o = libxml2mod.xmlCreateInputBuffer(self, enc)

    def __del__(self):
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
        self._o = None

    def close(self):
        """Close the wrapped object and free the libxml2 input buffer."""
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
        self._o = None


class ioWriteWrapper(ioWrapper):
    """ioWrapper bound to a libxml2 output buffer."""
    def __init__(self, _obj, enc = ""):
        if isinstance(_obj, str):
            print("write io from a string")
            # Nothing to wrap: initialise the base state so that
            # __del__/flush/close work on this instance.
            ioWrapper.__init__(self, None)
        elif isinstance(_obj, types.InstanceType):
            # Old-style class instance (Python 2): wrap it and create a
            # fresh output buffer writing through this wrapper.
            print("write io from instance of %s" % (_obj.__class__,))
            ioWrapper.__init__(self, _obj)
            self._o = libxml2mod.xmlCreateOutputBuffer(self, enc)
        else:
            # Otherwise _obj is used directly as the output buffer.
            pyfile = libxml2mod.outputBufferGetPythonFile(_obj)
            if pyfile is not None:
                ioWrapper.__init__(self, pyfile)
            else:
                ioWrapper.__init__(self, _obj)
            self._o = _obj

    def __del__(self):
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

    def flush(self):
        """Flush the wrapped object, then close the output buffer."""
        self.io_flush()
        # NOTE(review): this closes (not merely flushes) the libxml2
        # buffer; kept as-is, confirm whether that is intended.
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

    def close(self):
        """Flush the wrapped object and close the output buffer."""
        # NOTE(review): the wrapped object is flushed, not closed, here;
        # it is only closed by __del__.
        self.io_flush()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None


#
# Example of a class to handle SAX events
#
class SAXCallback:
    """Base class for SAX handlers"""
    def startDocument(self):
        """called at the start of the document"""
        pass

    def endDocument(self):
        """called at the end of the document"""
        pass

    def startElement(self, tag, attrs):
        """called at the start of every element, tag is the name of
           the element, attrs is a dictionary of the element's attributes"""
        pass

    def endElement(self, tag):
        """called at the end of every element, tag is the name of
           the element"""
        pass

    def characters(self, data):
        """called when character data have been read, data is the string
           containing the data, multiple consecutive characters() callback
           are possible."""
        pass

    def cdataBlock(self, data):
        """called when CDATA section have been read, data is the string
           containing the data, multiple consecutive cdataBlock() callback
           are possible."""
        pass

    def reference(self, name):
        """called when an entity reference has been found"""
        pass

    def ignorableWhitespace(self, data):
        """called when potentially ignorable white spaces have been found"""
        pass

    def processingInstruction(self, target, data):
        """called when a PI has been found, target contains the PI name and
           data is the associated data in the PI"""
        pass

    def comment(self, content):
        """called when a comment has been found, content contains the comment"""
        pass

    def externalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def internalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def entityDecl(self, name, type, externalID, systemID, content):
        """called when an ENTITY declaration has been found, name is the
           entity name and externalID, systemID are the entity public and
           system identifier for that entity if available, type indicates
           the entity type, and content reports its string content"""
        pass

    def notationDecl(self, name, externalID, systemID):
        """called when a NOTATION declaration has been found, name is the
           notation name and externalID, systemID are the notation public and
           system identifier for that notation if available"""
        pass

    def attributeDecl(self, elem, name, type, defi, defaultValue, nameList):
        """called when an ATTRIBUTE definition has been found"""
        pass

    def elementDecl(self, name, type, content):
        """called when an ELEMENT definition has been found"""
        pass

    # This handler used to be a second `entityDecl`, which silently
    # replaced the five-argument version above.
    def unparsedEntityDecl(self, name, publicId, systemID, notationName):
        """called when an unparsed ENTITY declaration has been found,
           name is the entity name and publicId, systemID are the entity
           public and system identifier for that entity if available,
           and notationName indicate the associated NOTATION"""
        pass

    def warning(self, msg):
        """print the warning message"""
        print(msg)

    def error(self, msg):
        """raise parserError with the given message"""
        raise parserError(msg)

    def fatalError(self, msg):
        """raise parserError with the given message"""
        raise parserError(msg)


#
# This class is the ancestor of all the Node classes. It provides
# the basic functionalities shared by all nodes (and handle
# gracefully the exception), like name, navigation in the tree,
# doc reference, content access and serializing to a string or URI
#
class xmlCore:
    def __init__(self, _obj=None):
        # _o is the underlying libxml2mod object, or None.
        self._o = _obj

    def get_parent(self):
        ret = libxml2mod.parent(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)

    def get_children(self):
        ret = libxml2mod.children(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)

    def get_last(self):
        ret = libxml2mod.last(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)

    def get_next(self):
        ret = libxml2mod.next(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)

    def get_properties(self):
        ret = libxml2mod.properties(self._o)
        if ret is None:
            return None
        return xmlAttr(_obj=ret)

    def get_prev(self):
        ret = libxml2mod.prev(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)

    def get_content(self):
        return libxml2mod.xmlNodeGetContent(self._o)
    getContent = get_content  # why is this duplicate naming needed ?

    def get_name(self):
        return libxml2mod.name(self._o)

    def get_type(self):
        return libxml2mod.type(self._o)

    def get_doc(self):
        ret = libxml2mod.doc(self._o)
        if ret is None:
            # A document node is its own document.
            if self.type in ("document_xml", "document_html"):
                return xmlDoc(_obj=self._o)
            return None
        return xmlDoc(_obj=ret)

    #
    # Those are common attributes to nearly all type of nodes
    # defined as python2 properties
    #
    if sys.version_info < (2, 2):
        # No property() before Python 2.2: emulate the attributes by
        # dispatching to the matching get_* method.
        def __getattr__(self, attr):
            if attr in ("parent", "properties", "children", "last", "next",
                        "prev", "content", "name", "type", "doc"):
                return getattr(self, "get_" + attr)()
            raise AttributeError(attr)
    else:
        parent = property(get_parent, None, None, "Parent node")
        children = property(get_children, None, None, "First child node")
        last = property(get_last, None, None, "Last child node")
        next = property(get_next, None, None, "Next sibling node")
        prev = property(get_prev, None, None, "Previous sibling node")
        properties = property(get_properties, None, None,
                              "List of properties")
        content = property(get_content, None, None, "Content of this node")
        name = property(get_name, None, None, "Node name")
        type = property(get_type, None, None, "Node type")
        doc = property(get_doc, None, None,
                       "The document this node belongs to")

    #
    # Serialization routines, the optional arguments have the following
    # meaning:
    #     encoding: string to ask saving in a specific encoding
    #     format: if 1 the serializer is asked to indent the output
    #
    def serialize(self, encoding = None, format = 0):
        return libxml2mod.serializeNode(self._o, encoding, format)

    def saveTo(self, file, encoding = None, format = 0):
        return libxml2mod.saveNodeTo(self._o, file, encoding, format)

    #
    # Canonicalization routines:
    #
    #   nodes: the node set (tuple or list) to be included in the
    #     canonized image or None if all document nodes should be
    #     included.
    #   exclusive: the exclusive flag (0 - non-exclusive
    #     canonicalization; otherwise - exclusive canonicalization)
    #   prefixes: the list of inclusive namespace prefixes (strings),
    #     or None if there is no inclusive namespaces (only for
    #     exclusive canonicalization, ignored otherwise)
    #   with_comments: include comments in the result (!=0) or not
    #     (==0)
    def c14nMemory(self,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocDumpMemory(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0)

    def c14nSaveTo(self,
                   file,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocSaveTo(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0,
            file)

    #
    # Selecting nodes using XPath, a bit slow because the context
    # is allocated/freed every time but convenient.
    #
    def xpathEval(self, expr):
        doc = self.doc
        if doc is None:
            return None
        ctxt = doc.xpathNewContext()
        try:
            ctxt.setContextNode(self)
            return ctxt.xpathEval(expr)
        finally:
            # Free the context even when the evaluation raises.
            ctxt.xpathFreeContext()

    #
    # xpathEval2 used to cache one XPath context per xmlDoc (faster);
    # that was removed because of memory leaks (DV, c.f. #126735), so it
    # is now a plain alias for xpathEval().
    #
    def xpathEval2(self, expr):
        return self.xpathEval(expr)

    # support for python2 iterators
    def walk_depth_first(self):
        return xmlCoreDepthFirstItertor(self)

    def walk_breadth_first(self):
        return xmlCoreBreadthFirstItertor(self)
    __iter__ = walk_depth_first

    def free(self):
        # Best effort: release a cached XPath context if one was attached
        # to the document; normally there is none.
        try:
            self.doc._ctxt.xpathFreeContext()
        except Exception:
            pass
        # NOTE(review): xmlFreeDoc is called on self._o whatever the node
        # type is; presumably only meant for document nodes - confirm.
        libxml2mod.xmlFreeDoc(self._o)


#
# implements the depth-first iterator for libxml2 DOM tree
#
class xmlCoreDepthFirstItertor:
    def __init__(self, node):
        self.node = node
        self.parents = []   # stack of nodes whose next sibling is pending

    def __iter__(self):
        return self

    def next(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                self.node = self.node.children
                return ret
            try:
                parent = self.parents.pop()
            except IndexError:
                raise StopIteration
            self.node = parent.next
    __next__ = next  # Python 3 iterator protocol


#
# implements the breadth-first iterator for libxml2 DOM tree
#
class xmlCoreBreadthFirstItertor:
    def __init__(self, node):
        self.node = node
        self.parents = []   # queue of visited nodes awaiting expansion

    def __iter__(self):
        return self

    def next(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                self.node = self.node.next
                return ret
            try:
                # FIFO: expand the oldest visited node first, otherwise
                # deeper nodes are returned before shallower ones.
                parent = self.parents.pop(0)
            except IndexError:
                raise StopIteration
            self.node = parent.children
    __next__ = next  # Python 3 iterator protocol


#
# converters to present a nicer view of the XPath returns
#
def nodeWrap(o):
    """Wrap a raw libxml2mod node in the wrapper class matching its type."""
    # TODO try to cast to the most appropriate node class
    # Dispatch on the node *type* ("element", "attribute", ...), not on
    # its name, which is the tag/attribute name.
    kind = libxml2mod.type(o)
    if kind == "element" or kind == "text":
        return xmlNode(_obj=o)
    if kind == "attribute":
        return xmlAttr(_obj=o)
    if kind[0:8] == "document":
        return xmlDoc(_obj=o)
    if kind[0:9] == "namespace":
        return xmlNs(_obj=o)
    if kind == "elem_decl":
        return xmlElement(_obj=o)
    if kind == "attribute_decl":
        return xmlAttribute(_obj=o)
    if kind == "entity_decl":
        return xmlEntity(_obj=o)
    if kind == "dtd":
        return xmlDtd(_obj=o)
    return xmlNode(_obj=o)


def xpathObjectRet(o):
    """Wrap every item of a list/tuple with nodeWrap(); pass others through."""
    if isinstance(o, (list, tuple)):
        return [nodeWrap(x) for x in o]
    return o


#
# register an XPath function
#
def registerXPathFunction(ctxt, name, ns_uri, f):
    """Register f as an XPath function; returns the libxml2mod result."""
    ret = libxml2mod.xmlRegisterXPathFunction(ctxt, name, ns_uri, f)
    return ret


#
# For the xmlTextReader parser configuration
#
PARSER_LOADDTD=1
PARSER_DEFAULTATTRS=2
PARSER_VALIDATE=3
PARSER_SUBST_ENTITIES=4

#
# For the error callback severities
#
PARSER_SEVERITY_VALIDITY_WARNING=1
PARSER_SEVERITY_VALIDITY_ERROR=2
PARSER_SEVERITY_WARNING=3
PARSER_SEVERITY_ERROR=4


#
# register the libxml2 error handler
#
def registerErrorHandler(f, ctx):
    """Register a Python written function for error reporting.
       The function is called back as f(ctx, error). """
    if 'libxslt' not in sys.modules:
        # normal behaviour when libxslt is not imported
        ret = libxml2mod.xmlRegisterErrorHandler(f, ctx)
    else:
        # when libxslt is already imported, one must
        # use libxslt's error handler instead
        import libxslt
        ret = libxslt.registerErrorHandler(f, ctx)
    return ret


class parserCtxtCore:

    def __init__(self, _obj=None):
        # _o is the underlying libxml2mod parser context, or None.
        self._o = _obj

    def __del__(self):
        if self._o is not None:
            libxml2mod.xmlFreeParserCtxt(self._o)
        self._o = None

    def setErrorHandler(self, f, arg):
        """Register an error handler that will be called back as
           f(arg,msg,severity,reserved).

           @reserved is currently always None."""
        libxml2mod.xmlParserCtxtSetErrorHandler(self._o, f, arg)

    def getErrorHandler(self):
        """Return (f,arg) as previously registered with setErrorHandler
           or (None,None)."""
        return libxml2mod.xmlParserCtxtGetErrorHandler(self._o)

    def addLocalCatalog(self, uri):
        """Register a local catalog with the parser"""
        return libxml2mod.addLocalCatalog(self._o, uri)


def _xmlTextReaderErrorFunc(handler, msg, severity, locator):
    """Intermediate callback to wrap the locator"""
    # handler is the (f, arg) pair stored by SetErrorHandler().
    (f, arg) = handler
    return f(arg, msg, severity, xmlTextReaderLocator(locator))


class xmlTextReaderCore:

    def __init__(self, _obj=None):
        self.input = None
        # _o is the underlying libxml2mod text reader, or None.
        self._o = _obj

    def __del__(self):
        if self._o is not None:
            libxml2mod.xmlFreeTextReader(self._o)
        self._o = None

    def SetErrorHandler(self, f, arg):
        """Register an error handler that will be called back as
           f(arg,msg,severity,locator)."""
        if f is None:
            libxml2mod.xmlTextReaderSetErrorHandler(
                self._o, None, None)
        else:
            libxml2mod.xmlTextReaderSetErrorHandler(
                self._o, _xmlTextReaderErrorFunc, (f, arg))

    def GetErrorHandler(self):
        """Return (f,arg) as previously registered with SetErrorHandler
           or (None,None)."""
        f, arg = libxml2mod.xmlTextReaderGetErrorHandler(self._o)
        if f is None:
            return None, None
        # assert f is _xmlTextReaderErrorFunc
        # arg is the (f, arg) pair handed over by SetErrorHandler().
        return arg


#
# The cleanup now goes through a wrapper in libxml.c
#
def cleanupParser():
    libxml2mod.xmlPythonCleanupParser()

# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
#
# Everything before this line comes from libxml.py
# Everything after this line is automatically generated
#
# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
