libxml.py revision 87f3287d9b00bdc39be032a8a3f2d46860eaf787
1import libxml2mod
2import types
3import sys
4
5# The root of all libxml2 errors.
class libxmlError(Exception):
    """Root of the libxml2 exception hierarchy."""
7
8# Type of the wrapper class for the C objects wrappers
def checkWrapper(obj):
    """Tell whether obj is unsuitable as a wrapped C object.

    Returns 0 when obj is None or when its type is named 'PyCObject' or
    'PyCapsule', and 1 for any other object.
    """
    # The previous body inspected an undefined name (_obj); the resulting
    # NameError was swallowed by a bare except, so every call returned 0.
    # None stays accepted because the wrapper classes in this file use
    # _obj=None as their "no C object yet" default.
    if obj is None:
        return 0
    if type(obj).__name__ in ('PyCObject', 'PyCapsule'):
        return 0
    return 1
17
18#
19# id() is sometimes negative ...
20#
def pos_id(o):
    """Return id(o), remapped to a positive number when id() is negative."""
    ident = id(o)
    # A negative identifier is mapped to sys.maxsize - ident, which lies
    # above sys.maxsize; non-negative identifiers are returned unchanged.
    return sys.maxsize - ident if ident < 0 else ident
26
27#
28# Errors raised by the wrappers when some tree handling failed.
29#
class treeError(libxmlError):
    """Error raised by the wrappers when some tree handling failed.

    The message is kept in the ``msg`` attribute and is what ``str()``
    returns.
    """
    def __init__(self, msg):
        # Forward the message to the base class so that ``args`` is
        # populated; with empty ``args`` repr() drops the message and the
        # exception cannot be copied or pickled (the constructor would be
        # re-invoked without its required argument).
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg
35
class parserError(libxmlError):
    """libxml2 error carrying a parser message.

    Raised, among others, by SAXCallback.error() and
    SAXCallback.fatalError().  The message is kept in the ``msg``
    attribute and is what ``str()`` returns.
    """
    def __init__(self, msg):
        # Forward the message to the base class so that ``args`` is
        # populated; with empty ``args`` repr() drops the message and the
        # exception cannot be copied or pickled.
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg
41
class uriError(libxmlError):
    """libxml2 error subclass carrying a message string.

    Presumably used for URI handling failures (inferred from the name;
    no raise site is visible here).  The message is kept in the ``msg``
    attribute and is what ``str()`` returns.
    """
    def __init__(self, msg):
        # Forward the message to the base class so that ``args`` is
        # populated; with empty ``args`` repr() drops the message and the
        # exception cannot be copied or pickled.
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg
47
class xpathError(libxmlError):
    """libxml2 error subclass carrying a message string.

    Presumably used for XPath failures (inferred from the name; no raise
    site is visible here).  The message is kept in the ``msg`` attribute
    and is what ``str()`` returns.
    """
    def __init__(self, msg):
        # Forward the message to the base class so that ``args`` is
        # populated; with empty ``args`` repr() drops the message and the
        # exception cannot be copied or pickled.
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg
53
class ioWrapper:
    """Adapter exposing a Python file-like object through io_* methods.

    The wrapped object is kept privately; once it has been detached
    (closed, or dropped after a failed read) every io_* method
    returns -1.  ``_o`` is initialised to None here and filled in by
    subclasses.
    """
    def __init__(self, _obj):
        self.__io = _obj
        self._o = None

    def io_close(self):
        """Close and detach the wrapped object.

        Returns 0 on success, -1 when no object is attached."""
        if self.__io is None:
            return -1
        self.__io.close()
        self.__io = None
        return 0

    def io_flush(self):
        """Flush the wrapped object.

        Returns 0 on success, -1 when no object is attached."""
        if self.__io is None:
            return -1
        self.__io.flush()
        return 0

    def io_read(self, len = -1):
        """Read from the wrapped object.

        A negative len reads everything, otherwise at most len items are
        requested.  Returns the data read, or -1 when no object is
        attached or when the read raised an exception."""
        if self.__io is None:
            return -1
        try:
            if len < 0:
                ret = self.__io.read()
            else:
                ret = self.__io.read(len)
        except Exception as e:
            print("failed to read from Python:", type(e))
            print("on IO:", self.__io)
            # Detach the failing object so later calls return -1 right
            # away.  This used to be written `==`, a comparison whose
            # result was discarded, so the object was never dropped.
            self.__io = None
            return -1

        return ret

    def io_write(self, str, len = -1):
        """Write str to the wrapped object.

        With a negative len the call is write(str), otherwise
        write(str, len).  Returns whatever the wrapped write() returns,
        or -1 when no object is attached."""
        if self.__io is None:
            return -1
        if len < 0:
            return self.__io.write(str)
        return self.__io.write(str, len)
94
class ioReadWrapper(ioWrapper):
    """ioWrapper bound to an input buffer created by libxml2mod.

    The buffer handle returned by libxml2mod.xmlCreateInputBuffer() is
    stored in ``_o`` and released by close() or on finalisation.
    """
    def __init__(self, _obj, enc = ""):
        ioWrapper.__init__(self, _obj)
        self._o = libxml2mod.xmlCreateInputBuffer(self, enc)

    def __del__(self):
        # Same teardown as close(); a leftover debugging print that wrote
        # "__del__" to stdout on every finalisation has been removed.
        # The method is called through the class so that finalisation
        # keeps doing exactly this even if a subclass overrides close().
        ioReadWrapper.close(self)

    def close(self):
        """Close the wrapped object and free the input buffer, if any."""
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
        self._o = None
112
class ioWriteWrapper(ioWrapper):
    """ioWrapper bound to a libxml2 output buffer handle.

    _obj is either a string (only a diagnostic is printed and the wrapper
    stays empty) or an output buffer handle, which is stored in ``_o``.
    enc is accepted but not used by this class.
    """
    def __init__(self, _obj, enc = ""):
        # Initialise the base state first so that __del__ is safe on
        # every path, including an exception raised below.
        ioWrapper.__init__(self, None)
        if isinstance(_obj, str):
            # This branch used to set a misspelled `o` attribute and skip
            # the base initialisation, so finalising the object raised
            # AttributeError.
            print("write io from a string")
            return
        # The 'PyCapsule' case and the generic case were identical.
        file = libxml2mod.outputBufferGetPythonFile(_obj)
        if file is not None:
            ioWrapper.__init__(self, file)
        else:
            ioWrapper.__init__(self, _obj)
        self._o = _obj

    def _closeOutputBuffer(self):
        # Close the output buffer held in _o (if any) and forget it.
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

    def __del__(self):
        self.io_close()
        self._closeOutputBuffer()

    def flush(self):
        """Flush the wrapped object, then close the output buffer.

        NOTE(review): despite its name this closes the output buffer,
        exactly like close(); confirm whether a flush-only call was
        intended."""
        self.io_flush()
        self._closeOutputBuffer()

    def close(self):
        """Flush the wrapped object, then close the output buffer."""
        self.io_flush()
        self._closeOutputBuffer()
156
157#
158# Example of a class to handle SAX events
159#
class SAXCallback:
    """Base class for SAX handlers.

    Every callback is a no-op except error() and fatalError(), which
    raise parserError.  Subclasses override the events they care about.
    """
    def startDocument(self):
        """called at the start of the document"""
        pass

    def endDocument(self):
        """called at the end of the document"""
        pass

    def startElement(self, tag, attrs):
        """called at the start of every element, tag is the name of
           the element, attrs is a dictionary of the element's attributes"""
        pass

    def endElement(self, tag):
        """called at the end of every element, tag is the name of
           the element"""
        pass

    def characters(self, data):
        """called when character data have been read, data is the string
           containing the data, multiple consecutive characters() callback
           are possible."""
        pass

    def cdataBlock(self, data):
        """called when CDATA section have been read, data is the string
           containing the data, multiple consecutive cdataBlock() callback
           are possible."""
        pass

    def reference(self, name):
        """called when an entity reference has been found"""
        pass

    def ignorableWhitespace(self, data):
        """called when potentially ignorable white spaces have been found"""
        pass

    def processingInstruction(self, target, data):
        """called when a PI has been found, target contains the PI name and
           data is the associated data in the PI"""
        pass

    def comment(self, content):
        """called when a comment has been found, content contains the comment"""
        pass

    def externalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def internalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def entityDecl(self, name, type, externalID, systemID, content):
        """called when an ENTITY declaration has been found, name is the
           entity name and externalID, systemID are the entity public and
           system identifier for that entity if available, type indicates
           the entity type, and content reports its string content"""
        pass

    def notationDecl(self, name, externalID, systemID):
        """called when a NOTATION declaration has been found, name is the
           notation name and externalID, systemID are the notation public and
           system identifier for that notation if available"""
        pass

    def attributeDecl(self, elem, name, type, defi, defaultValue, nameList):
        """called when an ATTRIBUTE definition has been found"""
        pass

    def elementDecl(self, name, type, content):
        """called when an ELEMENT definition has been found"""
        pass

    # This handler used to be a second definition of entityDecl, which
    # silently replaced the five-argument entityDecl above.
    # NOTE(review): unparsedEntityDecl is the SAX name for this event;
    # confirm it is the method name the C bindings dispatch on.
    def unparsedEntityDecl(self, name, publicId, systemID, notationName):
        """called when an unparsed ENTITY declaration has been found,
           name is the entity name and publicId, systemID are the entity
           public and system identifier for that entity if available,
           and notationName indicate the associated NOTATION"""
        pass

    def warning(self, msg):
        """called with a warning message; ignored by default"""
        pass

    def error(self, msg):
        """called with an error message; raises parserError(msg)"""
        raise parserError(msg)

    def fatalError(self, msg):
        """called with a fatal error message; raises parserError(msg)"""
        raise parserError(msg)
258
#
# This class is the ancestor of all the Node classes. It provides
# the basic functionalities shared by all nodes (and handles
# exceptions gracefully), like name, navigation in the tree,
# doc reference, content access and serializing to a string or URI
#
class xmlCore:
    """Ancestor of all the node wrapper classes.

    Holds the underlying handle in ``_o`` and provides what all nodes
    share: name, navigation in the tree, document reference, content
    access, serialization, canonicalization, XPath evaluation and
    iteration.  All the work is delegated to libxml2mod.
    """
    def __init__(self, _obj=None):
        # _o is the handle handed to every libxml2mod call (None if unset).
        self._o = _obj

    def __eq__(self, other):
        """Compare the underlying nodes through libxml2mod.

        None never compares equal; objects that carry no ``_o`` handle
        are left to Python's default comparison instead of raising
        AttributeError."""
        if other is None:
            return False
        try:
            other_o = other._o
        except AttributeError:
            return NotImplemented
        ret = libxml2mod.compareNodesEqual(self._o, other_o)
        if ret is None:
            return False
        return ret == True
    def __ne__(self, other):
        """Negation of the libxml2mod node comparison (see __eq__)."""
        if other is None:
            return True
        try:
            other_o = other._o
        except AttributeError:
            return NotImplemented
        ret = libxml2mod.compareNodesEqual(self._o, other_o)
        return not ret
    def __hash__(self):
        ret = libxml2mod.nodeHash(self._o)
        return ret

    def __str__(self):
        return self.serialize()

    #
    # Accessors: each one returns None when libxml2mod has nothing to
    # report, otherwise the result wrapped in a Python object.
    #
    def get_parent(self):
        ret = libxml2mod.parent(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_children(self):
        ret = libxml2mod.children(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_last(self):
        ret = libxml2mod.last(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_next(self):
        ret = libxml2mod.next(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_properties(self):
        ret = libxml2mod.properties(self._o)
        if ret is None:
            return None
        return xmlAttr(_obj=ret)
    def get_prev(self):
        ret = libxml2mod.prev(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_content(self):
        return libxml2mod.xmlNodeGetContent(self._o)
    getContent = get_content  # why is this duplicate naming needed ?
    def get_name(self):
        return libxml2mod.name(self._o)
    def get_type(self):
        return libxml2mod.type(self._o)
    def get_doc(self):
        """Return the owning document wrapped as xmlDoc.

        When libxml2mod reports no document and this node is itself a
        document, the node is wrapped instead; otherwise None."""
        ret = libxml2mod.doc(self._o)
        if ret is None:
            if self.type in ["document_xml", "document_html"]:
                return xmlDoc(_obj=self._o)
            return None
        return xmlDoc(_obj=ret)

    #
    # Those are common attributes to nearly all type of nodes, defined
    # as properties.  A __getattr__ fallback for interpreters older than
    # Python 2.2 used to live here; it was unreachable (the file already
    # needs print() and "except ... as") and was selected by slicing
    # sys.version, so it has been dropped.
    #
    parent = property(get_parent, None, None, "Parent node")
    children = property(get_children, None, None, "First child node")
    last = property(get_last, None, None, "Last sibling node")
    next = property(get_next, None, None, "Next sibling node")
    prev = property(get_prev, None, None, "Previous sibling node")
    properties = property(get_properties, None, None, "List of properies")
    content = property(get_content, None, None, "Content of this node")
    name = property(get_name, None, None, "Node name")
    type = property(get_type, None, None, "Node type")
    doc = property(get_doc, None, None, "The document this node belongs to")

    #
    # Serialization routines, the optional arguments have the following
    # meaning:
    #     encoding: string to ask saving in a specific encoding
    #     format: if 1 the serializer is asked to indent the output
    #
    def serialize(self, encoding = None, format = 0):
        return libxml2mod.serializeNode(self._o, encoding, format)
    def saveTo(self, file, encoding = None, format = 0):
        return libxml2mod.saveNodeTo(self._o, file, encoding, format)

    #
    # Canonicalization routines:
    #
    #   nodes: the node set (tuple or list) to be included in the
    #     canonized image or None if all document nodes should be
    #     included.
    #   exclusive: the exclusive flag (0 - non-exclusive
    #     canonicalization; otherwise - exclusive canonicalization)
    #   prefixes: the list of inclusive namespace prefixes (strings),
    #     or None if there is no inclusive namespaces (only for
    #     exclusive canonicalization, ignored otherwise)
    #   with_comments: include comments in the result (!=0) or not
    #     (==0)
    def c14nMemory(self,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            # libxml2mod expects the raw handles, not the wrappers
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocDumpMemory(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0)
    def c14nSaveTo(self,
                   file,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            # libxml2mod expects the raw handles, not the wrappers
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocSaveTo(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0,
            file)

    #
    # Selecting nodes using XPath, a bit slow because the context
    # is allocated/freed every time but convenient.
    #
    def xpathEval(self, expr):
        """Evaluate the XPath expression expr with this node as context.

        Returns None when the node has no document."""
        doc = self.doc
        if doc is None:
            return None
        ctxt = doc.xpathNewContext()
        try:
            ctxt.setContextNode(self)
            return ctxt.xpathEval(expr)
        finally:
            # free the context even when the evaluation raises
            ctxt.xpathFreeContext()

    #
    # xpathEval2 used to cache one XPath context per document; that
    # variant was removed because of memory leaks (c.f. #126735), so it
    # is now a plain alias of xpathEval().
    #
    def xpathEval2(self, expr):
        return self.xpathEval(expr)

    # Remove namespaces
    def removeNsDef(self, href):
        """
        Remove a namespace definition from a node.  If href is None,
        remove all of the ns definitions on that node.  The removed
        namespaces are returned as a linked list.

        Note: If any child nodes referred to the removed namespaces,
        they will be left with dangling links.  You should call
        reconciliateNs() to fix those pointers.

        Note: This method does not free memory taken by the ns
        definitions.  You will need to free it manually with the
        freeNsList() method on the returned xmlNs object.
        """

        ret = libxml2mod.xmlNodeRemoveNsDef(self._o, href)
        if ret is None:
            return None
        return xmlNs(_obj=ret)

    # support for python iterators
    def walk_depth_first(self):
        return xmlCoreDepthFirstItertor(self)
    def walk_breadth_first(self):
        return xmlCoreBreadthFirstItertor(self)
    __iter__ = walk_depth_first

    def free(self):
        """Hand this object's handle to libxml2mod.xmlFreeDoc().

        A cached XPath context on the document (``_ctxt``), if any, is
        freed first on a best-effort basis."""
        try:
            self.doc._ctxt.xpathFreeContext()
        except Exception:
            # usually there is no cached context at all; ignore
            pass
        libxml2mod.xmlFreeDoc(self._o)
520
521
522#
523# implements the depth-first iterator for libxml2 DOM tree
524#
class xmlCoreDepthFirstItertor:
    """Iterator yielding a node, then its children, then its siblings.

    Works on any object exposing ``children`` and ``next`` attributes.
    """
    def __init__(self, node):
        self.node = node
        # nodes already returned whose ``next`` sibling is still to visit
        self.parents = []
    def __iter__(self):
        return self
    def next(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                # descend first; siblings are picked up when backtracking
                self.node = self.node.children
                return ret
            try:
                parent = self.parents.pop()
            except IndexError:
                raise StopIteration
            self.node = parent.next
    # Python 3 spells the iterator protocol __next__; without this alias
    # a for loop over the object fails with TypeError.
    __next__ = next
543
544#
545# implements the breadth-first iterator for libxml2 DOM tree
546#
class xmlCoreBreadthFirstItertor:
    """Iterator yielding a node, then its siblings, then their children.

    Works on any object exposing ``children`` and ``next`` attributes.
    """
    def __init__(self, node):
        self.node = node
        # nodes already returned whose ``children`` are still to visit
        self.parents = []
    def __iter__(self):
        return self
    def next(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                # follow the sibling chain first; children come later
                self.node = self.node.next
                return ret
            try:
                parent = self.parents.pop()
            except IndexError:
                raise StopIteration
            self.node = parent.children
    # Python 3 spells the iterator protocol __next__; without this alias
    # a for loop over the object fails with TypeError.
    __next__ = next
565
566#
567# converters to present a nicer view of the XPath returns
568#
def nodeWrap(o):
    """Wrap the raw node handle o in the Python class matching its type.

    The type name comes from libxml2mod.type(o); any type without a
    dedicated class is wrapped as xmlNode.
    """
    # TODO try to cast to the most appropriate node class
    kind = libxml2mod.type(o)
    if kind in ("element", "text"):
        wrapper = xmlNode
    elif kind == "attribute":
        wrapper = xmlAttr
    elif kind[0:8] == "document":
        # every document flavour shares the "document" prefix
        wrapper = xmlDoc
    elif kind == "namespace":
        wrapper = xmlNs
    elif kind == "elem_decl":
        wrapper = xmlElement
    elif kind == "attribute_decl":
        wrapper = xmlAttribute
    elif kind == "entity_decl":
        wrapper = xmlEntity
    elif kind == "dtd":
        wrapper = xmlDtd
    else:
        wrapper = xmlNode
    return wrapper(_obj=o)
589
def xpathObjectRet(o):
    """Convert an XPath result into plain Python values and node wrappers.

    Lists and tuples are converted element by element (keeping the
    container type), str/int/float values are returned untouched and
    everything else goes through nodeWrap().  Only exact types match,
    subclasses are treated as nodes.
    """
    kind = type(o)
    if kind is list:
        return [xpathObjectRet(item) for item in o]
    if kind is tuple:
        return tuple([xpathObjectRet(item) for item in o])
    if kind is str or kind is int or kind is float:
        return o
    return nodeWrap(o)
602
603#
604# register an XPath function
605#
def registerXPathFunction(ctxt, name, ns_uri, f):
    """Register f as an XPath function via libxml2mod.

    The four arguments are handed unchanged to
    libxml2mod.xmlRegisterXPathFunction(); its result is discarded and
    None is returned.
    """
    libxml2mod.xmlRegisterXPathFunction(ctxt, name, ns_uri, f)
608
#
# xmlTextReader parser configuration properties
#
PARSER_LOADDTD = 1
PARSER_DEFAULTATTRS = 2
PARSER_VALIDATE = 3
PARSER_SUBST_ENTITIES = 4

#
# Severity values handed to the error callbacks
#
PARSER_SEVERITY_VALIDITY_WARNING = 1
PARSER_SEVERITY_VALIDITY_ERROR = 2
PARSER_SEVERITY_WARNING = 3
PARSER_SEVERITY_ERROR = 4
624
625#
626# register the libxml2 error handler
627#
def registerErrorHandler(f, ctx):
    """Register a Python function for error reporting.

    The function is called back as f(ctx, error).  Returns the result of
    the underlying registration call.
    """
    import sys
    if 'libxslt' in sys.modules:
        # libxslt is already imported: its own error handler
        # registration has to be used instead
        import libxslt
        return libxslt.registerErrorHandler(f,ctx)
    # normal behaviour when libxslt is not imported
    return libxml2mod.xmlRegisterErrorHandler(f,ctx)
641
class parserCtxtCore:
    """Base of the parser context wrapper: owns the handle kept in ``_o``."""

    def __init__(self, _obj=None):
        self._o = None
        if _obj != None:
            self._o = _obj

    def __del__(self):
        # Release the parser context, if one is still attached.
        handle = self._o
        if handle != None:
            libxml2mod.xmlFreeParserCtxt(handle)
        self._o = None

    def setErrorHandler(self,f,arg):
        """Install an error handler, called back as
           f(arg,msg,severity,reserved).

           @reserved is currently always None."""
        libxml2mod.xmlParserCtxtSetErrorHandler(self._o, f, arg)

    def getErrorHandler(self):
        """Give back the (f,arg) pair installed with setErrorHandler,
           or (None,None)."""
        handler = libxml2mod.xmlParserCtxtGetErrorHandler(self._o)
        return handler

    def addLocalCatalog(self, uri):
        """Register a local catalog with the parser"""
        result = libxml2mod.addLocalCatalog(self._o, uri)
        return result
670
671
class ValidCtxtCore:
    """Mixin adding handler registration for DTD validation.

    The constructor ignores its arguments; ``_o`` has to be provided by
    the class this one is combined with.
    """

    def __init__(self, *args, **kw):
        """Accept and ignore any arguments."""

    def setValidityErrorHandler(self, err_func, warn_func, arg=None):
        """
        Install the error and warning handlers used during DTD
        validation.  They are called back as f(msg,arg)
        """
        ctxt = self._o
        libxml2mod.xmlSetValidErrors(ctxt, err_func, warn_func, arg)
683
684
class SchemaValidCtxtCore:
    """Mixin adding handler registration for Schema validation.

    The constructor ignores its arguments; ``_o`` has to be provided by
    the class this one is combined with.
    """

    def __init__(self, *args, **kw):
        """Accept and ignore any arguments."""

    def setValidityErrorHandler(self, err_func, warn_func, arg=None):
        """
        Install the error and warning handlers used during Schema
        validation.  They are called back as f(msg,arg)
        """
        ctxt = self._o
        libxml2mod.xmlSchemaSetValidErrors(ctxt, err_func, warn_func, arg)
696
697
class relaxNgValidCtxtCore:
    """Mixin adding handler registration for RelaxNG validation.

    The constructor ignores its arguments; ``_o`` has to be provided by
    the class this one is combined with.
    """

    def __init__(self, *args, **kw):
        """Accept and ignore any arguments."""

    def setValidityErrorHandler(self, err_func, warn_func, arg=None):
        """
        Install the error and warning handlers used during RelaxNG
        validation.  They are called back as f(msg,arg)
        """
        ctxt = self._o
        libxml2mod.xmlRelaxNGSetValidErrors(ctxt, err_func, warn_func, arg)
709
710
def _xmlTextReaderErrorFunc(xxx_todo_changeme,msg,severity,locator):
    """Intermediate callback to wrap the locator.

    xxx_todo_changeme is the (f, arg) pair stored by
    xmlTextReaderCore.SetErrorHandler(); f is called with the locator
    wrapped in an xmlTextReaderLocator and its result is returned."""
    handler, handler_arg = xxx_todo_changeme
    return handler(handler_arg, msg, severity,
                   xmlTextReaderLocator(locator))
715
class xmlTextReaderCore:
    """Base of the text reader wrapper: owns the handle kept in ``_o``."""

    def __init__(self, _obj=None):
        self.input = None
        self._o = None
        if _obj != None:
            self._o = _obj

    def __del__(self):
        # Release the reader, if one is still attached.
        handle = self._o
        if handle != None:
            libxml2mod.xmlFreeTextReader(handle)
        self._o = None

    def SetErrorHandler(self,f,arg):
        """Install an error handler, called back as
           f(arg,msg,severity,locator).  Passing None as f removes it."""
        if f is not None:
            # The C side always calls the intermediate function, which
            # receives the real handler and its argument as a pair.
            libxml2mod.xmlTextReaderSetErrorHandler(
                self._o, _xmlTextReaderErrorFunc, (f,arg))
        else:
            libxml2mod.xmlTextReaderSetErrorHandler(self._o, None, None)

    def GetErrorHandler(self):
        """Give back the (f,arg) pair installed with SetErrorHandler,
           or (None,None)."""
        handler, payload = libxml2mod.xmlTextReaderGetErrorHandler(self._o)
        if handler is not None:
            # handler is the intermediate function; payload is the
            # (f,arg) pair stored by SetErrorHandler
            return payload
        return None,None
747
#
# The cleanup now goes through a wrapper in libxml.c
#
def cleanupParser():
    """Run the parser cleanup by calling libxml2mod.xmlPythonCleanupParser().

    Returns None."""
    libxml2mod.xmlPythonCleanupParser()
    return None
753
754#
755# The interface to xmlRegisterInputCallbacks.
756# Since this API does not allow to pass a data object along with
757# match/open callbacks, it is necessary to maintain a list of all
758# Python callbacks.
759#
# Python-level input callbacks, in registration order.
__input_callbacks = []
def registerInputCallback(func):
    """Add func to the Python input callbacks.

    A dispatcher is registered with libxml2mod; for a given URI it tries
    the Python callbacks from the most recently added to the oldest and
    returns the first result that is not None (None if there is none).
    func is recorded only after that registration call has returned.
    """
    def findOpenCallback(URI):
        for callback in reversed(__input_callbacks):
            opened = callback(URI)
            if opened is None:
                continue
            return opened
        return None
    libxml2mod.xmlRegisterInputCallback(findOpenCallback)
    __input_callbacks.append(func)
769
def popInputCallbacks():
    """Drop the most recently registered input callback.

    Python-level callbacks are popped first; whenever none is left
    (including right after the last one was popped)
    libxml2mod.xmlUnregisterInputCallback() is called.
    """
    if __input_callbacks:
        __input_callbacks.pop()
    if not __input_callbacks:
        libxml2mod.xmlUnregisterInputCallback()
777
778# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
779#
780# Everything before this line comes from libxml.py
781# Everything after this line is automatically generated
782#
783# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
784
785