libxml.py revision 37e63946dc2745dc1e546ac273e43512fb4685fc
1import libxml2mod
2import types
3
4#
5# Errors raised by the wrappers when some tree handling failed.
6#
class treeError(Exception):
    """Error raised by the wrappers when some tree handling failed.

    Derives from Exception so that generic ``except Exception`` handlers
    see it.  The message is kept in ``msg`` and is what ``str()`` returns.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg
12
class parserError(Exception):
    """Error carrying a parser failure message.

    Derives from Exception so that generic ``except Exception`` handlers
    see it.  The message is kept in ``msg`` and is what ``str()`` returns.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg
18
class uriError(Exception):
    """Error carrying a URI handling failure message.

    Derives from Exception so that generic ``except Exception`` handlers
    see it.  The message is kept in ``msg`` and is what ``str()`` returns.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg
24
class xpathError(Exception):
    """Error carrying an XPath failure message.

    Derives from Exception so that generic ``except Exception`` handlers
    see it.  The message is kept in ``msg`` and is what ``str()`` returns.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg
30
class ioWrapper:
    """Adapter exposing a Python file-like object through io_* methods.

    Every io_* method returns -1 when there is no wrapped object, either
    because None was supplied or because io_close() already released it.
    """
    def __init__(self, _obj):
        # The wrapped Python I/O object; reset to None by io_close().
        self.__io = _obj
        # Slot for a libxml2mod buffer handle; the subclasses fill it in.
        self._o = None

    def io_close(self):
        """Close and release the wrapped object.

        Returns 0 on success, -1 if there is nothing to close."""
        if self.__io is None:
            return -1
        self.__io.close()
        self.__io = None
        return 0

    def io_flush(self):
        """Flush the wrapped object.

        Returns 0 on success, -1 if there is no wrapped object."""
        if self.__io is None:
            return -1
        self.__io.flush()
        return 0

    def io_read(self, len = -1):
        """Read from the wrapped object.

        A negative len reads everything available, otherwise at most
        len units are requested.  Returns what read() returned, or -1
        if there is no wrapped object."""
        if self.__io is None:
            return -1
        if len < 0:
            return self.__io.read()
        return self.__io.read(len)

    def io_write(self, str, len = -1):
        """Write str to the wrapped object.

        A negative len writes the whole string, otherwise only the
        first len characters.  Returns what write() returned, or -1 if
        there is no wrapped object."""
        if self.__io is None:
            return -1
        if len < 0:
            return self.__io.write(str)
        # write() on file-like objects takes a single argument, so the
        # length limit is applied by slicing instead of being passed on.
        return self.__io.write(str[:len])
62
class ioReadWrapper(ioWrapper):
    """Input-side wrapper around a Python I/O object.

    On construction the instance is handed to
    libxml2mod.xmlCreateInputBuffer() together with the encoding name,
    and the returned handle is kept in ``_o``.
    """
    def __init__(self, _obj, enc = ""):
        ioWrapper.__init__(self, _obj)
        self._o = libxml2mod.xmlCreateInputBuffer(self, enc)

    def __del__(self):
        # Same teardown as an explicit close(); no debugging output is
        # emitted from the destructor.
        self.close()

    def close(self):
        """Close the Python I/O object and free the input buffer, if any."""
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
        self._o = None
80
class ioWriteWrapper(ioWrapper):
    """Output-side wrapper around a Python I/O object or an existing
    libxml2mod output buffer.

    The constructor accepts three kinds of _obj:
      - a string: nothing is wrapped, the instance stays empty;
      - a class instance: it is wrapped and a new output buffer is
        created for it with libxml2mod.xmlCreateOutputBuffer();
      - anything else: treated as an existing output buffer handle,
        stored in ``_o``; the Python file attached to it (if
        libxml2mod.outputBufferGetPythonFile() finds one) is wrapped,
        otherwise the handle itself is.
    """
    def __init__(self, _obj, enc = ""):
#        print "ioWriteWrapper.__init__", _obj
        if type(_obj) == type(''):
            print("write io from a string")
            # Initialise the base state (wrapped object and _o both None)
            # so that __del__, flush() and close() can run safely on an
            # instance built from a string.
            ioWrapper.__init__(self, None)
        elif type(_obj) == types.InstanceType:
            print("write io from instance of %s" % (_obj.__class__))
            ioWrapper.__init__(self, _obj)
            self._o = libxml2mod.xmlCreateOutputBuffer(self, enc)
        else:
            pyfile = libxml2mod.outputBufferGetPythonFile(_obj)
            if pyfile is not None:
                ioWrapper.__init__(self, pyfile)
            else:
                ioWrapper.__init__(self, _obj)
            # ioWrapper.__init__ resets _o, so store the handle afterwards.
            self._o = _obj

    def __del__(self):
#        print "__del__"
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

    def flush(self):
        """Flush the Python I/O object, then close the output buffer.

        After this call ``_o`` is None."""
        self.io_flush()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

    def close(self):
        """Flush the Python I/O object, then close the output buffer.

        The Python I/O object itself is only closed by __del__."""
        self.io_flush()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None
117
118#
119# Example of a class to handle SAX events
120#
class SAXCallback:
    """Base class for SAX handlers.

    Every callback is a no-op except warning(), which prints the
    message, and error()/fatalError(), which raise parserError.
    Subclasses override the callbacks they are interested in."""
    def startDocument(self):
        """called at the start of the document"""
        pass

    def endDocument(self):
        """called at the end of the document"""
        pass

    def startElement(self, tag, attrs):
        """called at the start of every element, tag is the name of
           the element, attrs is a dictionary of the element's attributes"""
        pass

    def endElement(self, tag):
        """called at the end of every element, tag is the name of
           the element"""
        pass

    def characters(self, data):
        """called when character data have been read, data is the string
           containing the data, multiple consecutive characters() callback
           are possible."""
        pass

    def cdataBlock(self, data):
        """called when CDATA section have been read, data is the string
           containing the data, multiple consecutive cdataBlock() callback
           are possible."""
        pass

    def reference(self, name):
        """called when an entity reference has been found"""
        pass

    def ignorableWhitespace(self, data):
        """called when potentially ignorable white spaces have been found"""
        pass

    def processingInstruction(self, target, data):
        """called when a PI has been found, target contains the PI name and
           data is the associated data in the PI"""
        pass

    def comment(self, content):
        """called when a comment has been found, content contains the comment"""
        pass

    def externalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def internalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def entityDecl(self, name, type, externalID, systemID, content):
        """called when an ENTITY declaration has been found, name is the
           entity name and externalID, systemID are the entity public and
           system identifier for that entity if available, type indicates
           the entity type, and content reports its string content"""
        pass

    def notationDecl(self, name, externalID, systemID):
        """called when an NOTATION declaration has been found, name is the
           notation name and externalID, systemID are the notation public and
           system identifier for that notation if available"""
        pass

    def attributeDecl(self, elem, name, type, defi, defaultValue, nameList):
        """called when an ATTRIBUTE definition has been found"""
        pass

    def elementDecl(self, name, type, content):
        """called when an ELEMENT definition has been found"""
        pass

    # Given its own name: when this handler was also called entityDecl
    # it replaced the five-argument entityDecl defined above.
    def unparsedEntityDecl(self, name, publicId, systemID, notationName):
        """called when an unparsed ENTITY declaration has been found,
           name is the entity name and publicId, systemID are the entity
           public and system identifier for that entity if available,
           and notationName indicate the associated NOTATION"""
        pass

    def warning(self, msg):
        """called for a parser warning, the message is printed"""
        print(msg)

    def error(self, msg):
        """called for a parser error, raises parserError(msg)"""
        raise parserError(msg)

    def fatalError(self, msg):
        """called for a fatal parser error, raises parserError(msg)"""
        raise parserError(msg)
218
219#
# This class is the ancestor of all the Node classes. It provides
# the basic functionalities shared by all nodes (and handles
# the exceptions gracefully), like name, navigation in the tree,
# doc reference, content access and serializing to a string or URI
224#
class xmlCore:
    """Common ancestor of the node wrapper classes.

    The low-level libxml2mod object is kept in ``_o``.  The class
    offers the accessors shared by nodes (name, type, tree navigation,
    owning document, content) plus serialization, canonicalization,
    XPath evaluation and iteration helpers.
    """
    def __init__(self, _obj=None):
        if _obj is None:
            self._o = None
            return
        self._o = _obj

    #
    # Accessors: each one asks libxml2mod for the raw object and wraps
    # it in the matching Python class, or returns None when absent.
    #
    def get_parent(self):
        raw = libxml2mod.parent(self._o)
        if raw is not None:
            return xmlNode(_obj=raw)
        return None
    def get_children(self):
        raw = libxml2mod.children(self._o)
        if raw is not None:
            return xmlNode(_obj=raw)
        return None
    def get_last(self):
        raw = libxml2mod.last(self._o)
        if raw is not None:
            return xmlNode(_obj=raw)
        return None
    def get_next(self):
        raw = libxml2mod.next(self._o)
        if raw is not None:
            return xmlNode(_obj=raw)
        return None
    def get_properties(self):
        raw = libxml2mod.properties(self._o)
        if raw is not None:
            return xmlAttr(_obj=raw)
        return None
    def get_prev(self):
        raw = libxml2mod.prev(self._o)
        if raw is not None:
            return xmlNode(_obj=raw)
        return None
    def get_content(self):
        return libxml2mod.xmlNodeGetContent(self._o)
    getContent = get_content  # why is this duplicate naming needed ?
    def get_name(self):
        return libxml2mod.name(self._o)
    def get_type(self):
        return libxml2mod.type(self._o)
    def get_doc(self):
        owner = libxml2mod.doc(self._o)
        if owner is not None:
            return xmlDoc(_obj=owner)
        # No owning document reported: a document node is its own document.
        if self.type in ["document_xml", "document_html"]:
            return xmlDoc(_obj=self._o)
        return None
    #
    # Attributes common to nearly all types of nodes.  They are real
    # properties where the interpreter has property() (2.2 and later)
    # and are emulated through __getattr__ on older interpreters.
    #
    import sys
    if float(sys.version[0:3]) < 2.2:
        def __getattr__(self, attr):
            # Route each emulated attribute to xmlCore's own get_* method.
            if attr in ("parent", "properties", "children", "last", "next",
                        "prev", "content", "name", "type", "doc"):
                return getattr(xmlCore, "get_" + attr)(self)
            raise AttributeError(attr)
    else:
        parent = property(get_parent, None, None, "Parent node")
        children = property(get_children, None, None, "First child node")
        last = property(get_last, None, None, "Last sibling node")
        next = property(get_next, None, None, "Next sibling node")
        prev = property(get_prev, None, None, "Previous sibling node")
        properties = property(get_properties, None, None, "List of properies")
        content = property(get_content, None, None, "Content of this node")
        name = property(get_name, None, None, "Node name")
        type = property(get_type, None, None, "Node type")
        doc = property(get_doc, None, None, "The document this node belongs to")

    #
    # Serialization routines.  Both optional arguments are handed
    # unchanged to libxml2mod:
    #     encoding: string to ask saving in a specific encoding
    #     format: formatting flag for the serializer
    #
    def serialize(self, encoding = None, format = 0):
        """Return the result of libxml2mod.serializeNode() for this node."""
        return libxml2mod.serializeNode(self._o, encoding, format)
    def saveTo(self, file, encoding = None, format = 0):
        """Save this node to file through libxml2mod.saveNodeTo()."""
        return libxml2mod.saveNodeTo(self._o, file, encoding, format)

    #
    # Canonicalization routines:
    #
    #   nodes: the node set (tuple or list) to be included in the
    #     canonized image or None if all document nodes should be
    #     included.
    #   exclusive: the exclusive flag (0 - non-exclusive
    #     canonicalization; otherwise - exclusive canonicalization)
    #   prefixes: the list of inclusive namespace prefixes (strings),
    #     or None if there is no inclusive namespaces (only for
    #     exclusive canonicalization, ignored otherwise)
    #   with_comments: include comments in the result (!=0) or not
    #     (==0)
    def c14nMemory(self,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            # libxml2mod wants the raw objects, not the Python wrappers.
            raw_nodes = []
            for wrapper in nodes:
                raw_nodes.append(wrapper._o)
            nodes = raw_nodes
        owner = self.get_doc()
        return libxml2mod.xmlC14NDocDumpMemory(
            owner._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0)
    def c14nSaveTo(self,
                   file,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            # libxml2mod wants the raw objects, not the Python wrappers.
            raw_nodes = []
            for wrapper in nodes:
                raw_nodes.append(wrapper._o)
            nodes = raw_nodes
        owner = self.get_doc()
        return libxml2mod.xmlC14NDocSaveTo(
            owner._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0,
            file)

    #
    # Selecting nodes using XPath, a bit slow because the context
    # is allocated/freed every time but convenient.
    #
    def xpathEval(self, expr):
        owner = self.doc
        if owner is None:
            return None
        context = owner.xpathNewContext()
        context.setContextNode(self)
        result = context.xpathEval(expr)
        context.xpathFreeContext()
        return result

#    #
#    # Selecting nodes using XPath, faster because the context
#    # is allocated just once per xmlDoc.
#    #
#    # Removed: DV memleaks c.f. #126735
#    #
#    def xpathEval2(self, expr):
#        doc = self.doc
#        if doc == None:
#            return None
#        try:
#            doc._ctxt.setContextNode(self)
#        except:
#            doc._ctxt = doc.xpathNewContext()
#            doc._ctxt.setContextNode(self)
#        res = doc._ctxt.xpathEval(expr)
#        return res
    def xpathEval2(self, expr):
        # Kept as an alias of xpathEval() now that the cached-context
        # variant above is disabled.
        return self.xpathEval(expr)

    # support for python2 iterators
    def walk_depth_first(self):
        return xmlCoreDepthFirstItertor(self)
    def walk_breadth_first(self):
        return xmlCoreBreadthFirstItertor(self)
    __iter__ = walk_depth_first

    def free(self):
        # Best effort: release a cached XPath context if the document
        # carries one; any failure (typically no _ctxt) is ignored.
        try:
            self.doc._ctxt.xpathFreeContext()
        except:
            pass
        libxml2mod.xmlFreeDoc(self._o)
440
441
442#
443# implements the depth-first iterator for libxml2 DOM tree
444#
class xmlCoreDepthFirstItertor:
    """Iterator returning a node, then its children, then what follows.

    ``node`` is the next candidate to return; ``parents`` is the stack
    of nodes already returned whose ``next`` link is still to be
    followed.
    """
    def __init__(self, node):
        self.node = node
        self.parents = []
    def __iter__(self):
        return self
    def next(self):
        # No candidate left at this level: unwind the stack and move to
        # the following node of the most recently returned one.
        while not self.node:
            if not self.parents:
                raise StopIteration
            self.node = self.parents.pop().next
        current = self.node
        self.parents.append(current)
        # Go down first; siblings are reached later through the stack.
        self.node = current.children
        return current
462            self.node = parent.next
463
464#
465# implements the breadth-first iterator for libxml2 DOM tree
466#
class xmlCoreBreadthFirstItertor:
    """Iterator returning a node, then the nodes following it, then
    their children.

    ``node`` is the next candidate to return; ``parents`` is the stack
    of nodes already returned whose ``children`` link is still to be
    followed.
    """
    def __init__(self, node):
        self.node = node
        self.parents = []
    def __iter__(self):
        return self
    def next(self):
        # Current level exhausted: unwind the stack and descend into
        # the children of the most recently returned node.
        while not self.node:
            if not self.parents:
                raise StopIteration
            self.node = self.parents.pop().children
        current = self.node
        self.parents.append(current)
        # Stay on the same level first; children come from the stack.
        self.node = current.next
        return current
484            self.node = parent.children
485
486#
487# converters to present a nicer view of the XPath returns
488#
def nodeWrap(o):
    """Wrap a raw libxml2mod node object in the matching Python class.

    The class is selected from the node type string reported by
    libxml2mod.type(); unrecognized types fall back to xmlNode.
    """
    # TODO try to cast to the most appropriate node class
    # Dispatch on the node *type*: the node name (libxml2mod.name) is
    # document content and says nothing about the kind of node.
    kind = libxml2mod.type(o)
    if kind == "element" or kind == "text":
        return xmlNode(_obj=o)
    if kind == "attribute":
        return xmlAttr(_obj=o)
    if kind[0:8] == "document":
        return xmlDoc(_obj=o)
    # "namespace" is 9 characters long, so the prefix slice must be too.
    if kind[0:9] == "namespace":
        return xmlNs(_obj=o)
    if kind == "elem_decl":
        return xmlElement(_obj=o)
    if kind == "attribute_decl":
        return xmlAttribute(_obj=o)
    if kind == "entity_decl":
        return xmlEntity(_obj=o)
    if kind == "dtd":
        return xmlDtd(_obj=o)
    return xmlNode(_obj=o)
508    return xmlNode(_obj=o)
509
def xpathObjectRet(o):
    """Present an XPath return value in a nicer form.

    A list or tuple is turned into a list whose items went through
    nodeWrap(); any other value is returned unchanged.
    """
    kind = type(o)
    if kind != type([]) and kind != type(()):
        return o
    wrapped = []
    for item in o:
        wrapped.append(nodeWrap(item))
    return wrapped
515
516#
517# register an XPath function
518#
def registerXPathFunction(ctxt, name, ns_uri, f):
    """Register the Python callable f as an XPath function.

    ctxt, name, ns_uri and f are handed unchanged to
    libxml2mod.xmlRegisterXPathFunction(); its result is returned so
    the caller can check the outcome instead of it being dropped.
    """
    return libxml2mod.xmlRegisterXPathFunction(ctxt, name, ns_uri, f)
521
522#
523# For the xmlTextReader parser configuration
524#
# Selectors naming one xmlTextReader parser setting each.
# NOTE(review): presumably these mirror libxml2's xmlParserProperties
# enum values -- confirm against the C header before changing them.
PARSER_LOADDTD=1
PARSER_DEFAULTATTRS=2
PARSER_VALIDATE=3
PARSER_SUBST_ENTITIES=4
529
530#
531# For the error callback severities
532#
# Severity codes handed to error callbacks.
# NOTE(review): presumably these mirror libxml2's xmlParserSeverities
# enum values -- confirm against the C header before changing them.
PARSER_SEVERITY_VALIDITY_WARNING=1
PARSER_SEVERITY_VALIDITY_ERROR=2
PARSER_SEVERITY_WARNING=3
PARSER_SEVERITY_ERROR=4
537
538#
539# register the libxml2 error handler
540#
def registerErrorHandler(f, ctx):
    """Register a Python written function for error reporting.
       The function is called back as f(ctx, error).

       When the libxslt module has already been imported, the
       registration is delegated to libxslt.registerErrorHandler();
       otherwise libxml2mod.xmlRegisterErrorHandler() is used.  The
       result of whichever call was made is returned."""
    import sys
    if sys.modules.has_key('libxslt'):
        # when libxslt is already imported, one must
        # use libxslt's error handler instead
        import libxslt
        return libxslt.registerErrorHandler(f,ctx)
    # normal behaviour when libxslt is not imported
    return libxml2mod.xmlRegisterErrorHandler(f,ctx)
554
class parserCtxtCore:
    """Base wrapper holding a parser context handle in ``_o``.

    The handle, when present, is released with
    libxml2mod.xmlFreeParserCtxt() when the wrapper is destroyed.
    """

    def __init__(self, _obj=None):
        if _obj is None:
            self._o = None
            return
        self._o = _obj

    def __del__(self):
        handle = self._o
        if handle is not None:
            libxml2mod.xmlFreeParserCtxt(handle)
        self._o = None

    def setErrorHandler(self,f,arg):
        """Register an error handler that will be called back as
           f(arg,msg,severity,reserved).

           @reserved is currently always None."""
        libxml2mod.xmlParserCtxtSetErrorHandler(self._o, f, arg)

    def getErrorHandler(self):
        """Return (f,arg) as previously registered with setErrorHandler
           or (None,None)."""
        registered = libxml2mod.xmlParserCtxtGetErrorHandler(self._o)
        return registered

    def addLocalCatalog(self, uri):
        """Register a local catalog with the parser"""
        status = libxml2mod.addLocalCatalog(self._o, uri)
        return status
583
584
def _xmlTextReaderErrorFunc(handler,msg,severity,locator):
    """Intermediate callback to wrap the locator.

    handler is the (f, arg) pair registered together with this
    function; f is called as f(arg, msg, severity, wrapped_locator)
    and its result is returned."""
    # Unpack in the body: tuple parameters in a signature are not
    # accepted by later Python versions.
    f, arg = handler
    return f(arg,msg,severity,xmlTextReaderLocator(locator))
588
class xmlTextReaderCore:
    """Base wrapper holding a text reader handle in ``_o``.

    The handle, when present, is released with
    libxml2mod.xmlFreeTextReader() when the wrapper is destroyed.
    ``input`` starts out as None.
    """

    def __init__(self, _obj=None):
        self.input = None
        if _obj is None:
            self._o = None
        else:
            self._o = _obj

    def __del__(self):
        handle = self._o
        if handle is not None:
            libxml2mod.xmlFreeTextReader(handle)
        self._o = None

    def SetErrorHandler(self,f,arg):
        """Register an error handler that will be called back as
           f(arg,msg,severity,locator)."""
        if f is None:
            # Passing None clears the handler.
            hook, payload = None, None
        else:
            # The user's pair travels as the callback argument so the
            # intermediate function can wrap the locator before calling f.
            hook, payload = _xmlTextReaderErrorFunc, (f,arg)
        libxml2mod.xmlTextReaderSetErrorHandler(self._o, hook, payload)

    def GetErrorHandler(self):
        """Return (f,arg) as previously registered with SetErrorHandler
           or (None,None)."""
        hook, payload = libxml2mod.xmlTextReaderGetErrorHandler(self._o)
        if hook is not None:
            # assert hook is _xmlTextReaderErrorFunc
            # payload is the (f,arg) pair stored by SetErrorHandler.
            return payload
        return None,None
619            return arg
620
621#
# The cleanup now goes through a wrapper in libxml.c
623#
def cleanupParser():
    """Run the parser cleanup by calling libxml2mod.xmlPythonCleanupParser()."""
    libxml2mod.xmlPythonCleanupParser()
626
627# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
628#
629# Everything before this line comes from libxml.py
630# Everything after this line is automatically generated
631#
632# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
633
634