1import xml.sax
2import xml.sax.handler
3import types
4
5try:
6    _StringTypes = [types.StringType, types.UnicodeType]
7except AttributeError:
8    _StringTypes = [types.StringType]
9
# Event type tags.  The first item of every (tag, node) event pair
# queued by the content handler is one of these strings.

# Document boundaries
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"

# Element boundaries
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"

# Text content
CHARACTERS = "CHARACTERS"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"

# Other markup
COMMENT = "COMMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
18
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that records parser callbacks as DOM events.

    Each callback creates a DOM node through ``self.document`` and links
    an ``(event_type, node)`` pair onto a singly linked list whose cells
    are two-item lists ``[payload, next_cell]``.  ``firstEvent`` is a
    sentinel head cell; ``lastEvent`` is the cell that new events are
    linked after.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Create an empty event queue, element stack and namespace map.

        documentFactory -- object whose createDocument() is used by
            buildDocument(); when None, startDocument() installs a
            default one.
        """
        from xml.dom import XML_NAMESPACE
        head = [None, None]     # sentinel cell of the event list
        stack = []
        self.documentFactory = documentFactory
        self.firstEvent = head
        self.lastEvent = head
        self.elementStack = stack
        self.push = stack.append
        try:
            self.pop = stack.pop
        except AttributeError:
            # the stack has no pop(); fall back to the pop() method below
            pass
        initial_context = {XML_NAMESPACE: 'xml'}
        self._ns_contexts = [initial_context]   # stack of uri -> prefix dicts
        self._current_context = initial_context
        self.pending_events = []

    def pop(self):
        """Remove and return the top of the element stack."""
        stack = self.elementStack
        top = stack[-1]
        del stack[-1]
        return top

    def setDocumentLocator(self, locator):
        """Remember the locator handed over by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration and open a new prefix scope."""
        try:
            declared = self._xmlns_attrs
        except AttributeError:
            declared = self._xmlns_attrs = []
        # a default-namespace declaration is stored under the name 'xmlns'
        declared.append((prefix or 'xmlns', uri))
        # save a snapshot that endPrefixMapping() will restore
        snapshot = self._current_context.copy()
        self._ns_contexts.append(snapshot)
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Queue a START_ELEMENT event for a namespace-aware element.

        name is a (uri, localname) pair; tagName may be None, in which
        case a qualified name is derived from the current prefix map.
        """
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        # Turn the declarations collected by startPrefixMapping() into
        # xmlns attributes of this element.
        declared = getattr(self, '_xmlns_attrs', None)
        if declared is not None:
            for decl_name, decl_uri in declared:
                attrs._attrs[(xmlns_uri, decl_name)] = decl_uri
            self._xmlns_attrs = []

        uri, localname = name
        if not uri:
            # Without a namespace the tag is just the local name.
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)
        else:
            # The reader may not supply the original qualified name;
            # build *a* valid one from the current context then.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)

        for attr_name, attr_value in attrs.items():
            a_uri, a_localname = attr_name
            # First work out the qualified name (namespaced attrs only).
            if a_uri == xmlns_uri:
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
            elif a_uri:
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
            # Then create and attach the attribute node.
            if a_uri:
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = attr_value

        opened = [(START_ELEMENT, node), None]
        self.lastEvent[1] = opened
        self.lastEvent = opened
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue an END_ELEMENT event for the element on top of the stack."""
        closed = [(END_ELEMENT, self.pop()), None]
        self.lastEvent[1] = closed
        self.lastEvent = closed

    def startElement(self, name, attrs):
        """Queue a START_ELEMENT event for a non-namespace element."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for attr_name, attr_value in attrs.items():
            attr = self.document.createAttribute(attr_name)
            attr.value = attr_value
            node.setAttributeNode(attr)

        opened = [(START_ELEMENT, node), None]
        self.lastEvent[1] = opened
        self.lastEvent = opened
        self.push(node)

    def endElement(self, name):
        """Queue an END_ELEMENT event for the element on top of the stack."""
        closed = [(END_ELEMENT, self.pop()), None]
        self.lastEvent[1] = closed
        self.lastEvent = closed

    def comment(self, s):
        """Queue a COMMENT event, or hold it back until a document exists."""
        if not self.document:
            # No document to create the node with yet; buildDocument()
            # converts and queues this entry later.
            self.pending_events.append([(COMMENT, s), None])
            return
        cell = [(COMMENT, self.document.createComment(s)), None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, or hold it back until a
        document exists."""
        if not self.document:
            # Same deferral as comment(): keep the raw target and data.
            held = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(held)
            return
        node = self.document.createProcessingInstruction(target, data)
        cell = [(PROCESSING_INSTRUCTION, node), None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a new text node."""
        text = self.document.createTextNode(chars)
        cell = [(IGNORABLE_WHITESPACE, text), None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a new text node."""
        text = self.document.createTextNode(chars)
        cell = [(CHARACTERS, text), None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def startDocument(self):
        """Install the minidom implementation when no factory was given."""
        if self.documentFactory is not None:
            return
        from xml.dom.minidom import Document
        self.documentFactory = Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document, queue START_DOCUMENT and flush held events.

        This cannot happen in startDocument() because the tag name of
        the root element is needed.  Returns the new document's
        firstChild.
        """
        # XXX: obtain DocumentType
        doc = self.documentFactory.createDocument(uri, tagname, None)
        self.document = doc
        opening = [(START_DOCUMENT, doc), None]
        self.lastEvent[1] = opening
        self.lastEvent = opening
        self.push(doc)
        # Everything seen before the root element now gets a real node
        # and is linked into the queue, in arrival order.
        for cell in self.pending_events:
            payload = cell[0]
            kind = payload[0]
            if kind == PROCESSING_INSTRUCTION:
                _, target, data = payload
                created = self.document.createProcessingInstruction(target, data)
                cell[0] = (PROCESSING_INSTRUCTION, created)
            elif kind == COMMENT:
                created = self.document.createComment(payload[1])
                cell[0] = (COMMENT, created)
            else:
                raise AssertionError("Unknown pending event ", kind)
            self.lastEvent[1] = cell
            self.lastEvent = cell
        self.pending_events = None
        return doc.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the top of the element stack."""
        # lastEvent itself is deliberately left where it was, exactly
        # as before: only the link to the new cell is written.
        closing = [(END_DOCUMENT, self.document), None]
        self.lastEvent[1] = closing
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None
201
class ErrorHandler:
    """Error handler that prints warnings and raises everything else."""

    def warning(self, exception):
        """Print the warning to standard output and carry on."""
        # The parenthesised form prints the same text whether print is
        # the Python 2 statement or the Python 3 function; the bare
        # statement form is a syntax error on Python 3.
        print(exception)

    def error(self, exception):
        """Raise the exception passed in for a recoverable error."""
        raise exception

    def fatalError(self, exception):
        """Raise the exception passed in for a fatal error."""
        raise exception
209
210class DOMEventStream:
211    def __init__(self, stream, parser, bufsize):
212        self.stream = stream
213        self.parser = parser
214        self.bufsize = bufsize
215        if not hasattr(self.parser, 'feed'):
216            self.getEvent = self._slurp
217        self.reset()
218
219    def reset(self):
220        self.pulldom = PullDOM()
221        # This content handler relies on namespace support
222        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
223        self.parser.setContentHandler(self.pulldom)
224
225    def __getitem__(self, pos):
226        rc = self.getEvent()
227        if rc:
228            return rc
229        raise IndexError
230
231    def next(self):
232        rc = self.getEvent()
233        if rc:
234            return rc
235        raise StopIteration
236
237    def __iter__(self):
238        return self
239
240    def expandNode(self, node):
241        event = self.getEvent()
242        parents = [node]
243        while event:
244            token, cur_node = event
245            if cur_node is node:
246                return
247            if token != END_ELEMENT:
248                parents[-1].appendChild(cur_node)
249            if token == START_ELEMENT:
250                parents.append(cur_node)
251            elif token == END_ELEMENT:
252                del parents[-1]
253            event = self.getEvent()
254
255    def getEvent(self):
256        # use IncrementalParser interface, so we get the desired
257        # pull effect
258        if not self.pulldom.firstEvent[1]:
259            self.pulldom.lastEvent = self.pulldom.firstEvent
260        while not self.pulldom.firstEvent[1]:
261            buf = self.stream.read(self.bufsize)
262            if not buf:
263                self.parser.close()
264                return None
265            self.parser.feed(buf)
266        rc = self.pulldom.firstEvent[1][0]
267        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
268        return rc
269
270    def _slurp(self):
271        """ Fallback replacement for getEvent() using the
272            standard SAX2 interface, which means we slurp the
273            SAX events into memory (no performance gain, but
274            we are compatible to all SAX parsers).
275        """
276        self.parser.parse(self.stream)
277        self.getEvent = self._emit
278        return self._emit()
279
280    def _emit(self):
281        """ Fallback replacement for getEvent() that emits
282            the events that _slurp() read previously.
283        """
284        rc = self.pulldom.firstEvent[1][0]
285        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
286        return rc
287
288    def clear(self):
289        """clear(): Explicitly release parsing objects"""
290        self.pulldom.clear()
291        del self.pulldom
292        self.parser = None
293        self.stream = None
294
class SAX2DOM(PullDOM):
    """PullDOM variant that also links the created nodes into a tree.

    After delegating to PullDOM, each override appends the node it just
    created to the node below it on the element stack.
    """

    def buildDocument(self, uri, tagname):
        """Build the document and attach processing instructions that
        arrived before the root element.

        PullDOM.buildDocument() turns the held-back entries into nodes;
        the processing instructions among them are inserted in front of
        the node it returns, so they keep their place ahead of the root.
        """
        held = self.pending_events
        root = PullDOM.buildDocument(self, uri, tagname)
        for cell in held or ():
            kind, node = cell[0]
            if kind == PROCESSING_INSTRUCTION:
                self.document.insertBefore(node, root)
        return root

    def startElementNS(self, name, tagName , attrs):
        """Create the element and append it to its parent on the stack."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        """Create the element and append it to its parent on the stack."""
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        """Create the processing instruction and append it to the
        current node, if a document exists already."""
        PullDOM.processingInstruction(self, target, data)
        if not self.document:
            # PullDOM only queued the raw target/data in pending_events;
            # no node was created and lastEvent was not advanced, so
            # reading lastEvent[0][1] here used to raise TypeError.
            # buildDocument() attaches the node once it exists.
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        """Create the whitespace text node and append it to the current node."""
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        """Create the text node and append it to the current node."""
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)
326
327
# Chunk size used by parse() when the caller gives none: 20 short of 16 KiB.
default_bufsize = (1 << 14) - 20
329
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or a readable stream.

    stream_or_string -- a file name (any instance of the types listed
        in _StringTypes), which is opened here, or an object that is
        used as the stream directly
    parser  -- SAX parser to use; a default one is created when omitted
    bufsize -- chunk size for reads; defaults to default_bufsize
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance() rather than an exact type() match, so that instances
    # of string subclasses are recognised as file names too instead of
    # being mistaken for streams.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # NOTE: the file opened here is not closed by this function.
        stream = open(stream_or_string)
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
340
def parseString(string, parser=None):
    """Return a DOMEventStream over XML held in a string.

    The string is wrapped in an in-memory file and its length is used
    as the read size.  A default SAX parser is created when none is
    given.
    """
    # Prefer the C implementation of StringIO when it is available.
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    size = len(string)
    source = StringIO(string)
    if parser:
        sax_parser = parser
    else:
        sax_parser = xml.sax.make_parser()
    return DOMEventStream(source, sax_parser, size)
352