1ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport xml.sax
2ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport xml.sax.handler
3ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport types
4ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
5ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehtry:
6ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _StringTypes = [types.StringType, types.UnicodeType]
7ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehexcept AttributeError:
8ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _StringTypes = [types.StringType]
9ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
# Event tokens: the first item of every (token, node) pair queued by PullDOM.
# Element boundaries.
START_ELEMENT = 'START_ELEMENT'
END_ELEMENT = 'END_ELEMENT'
# Comment nodes.
COMMENT = 'COMMENT'
# Document boundaries.
START_DOCUMENT = 'START_DOCUMENT'
END_DOCUMENT = 'END_DOCUMENT'
# Processing instructions.
PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION'
# Text reported through ignorableWhitespace() and characters() respectively.
IGNORABLE_WHITESPACE = 'IGNORABLE_WHITESPACE'
CHARACTERS = 'CHARACTERS'
18ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that turns parser callbacks into queued DOM events.

    Events form a singly linked list of two-item lists
    ``[(token, node), next_event]``.  ``firstEvent`` is a dummy head whose
    second slot points at the oldest queued event; ``lastEvent`` is the
    event most recently linked in.  Nodes are created through
    ``self.document`` but this class never attaches them to a parent.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Set up empty event, element-stack and namespace bookkeeping.

        documentFactory -- object whose createDocument() builds the
            document; when None, startDocument() picks minidom's
            implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        # Dummy head of the event chain; also the tail while it is empty.
        head = [None, None]
        self.firstEvent = head
        self.lastEvent = head
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            self.pop = self.elementStack.pop
        except AttributeError:
            # list has no pop(): keep using the pop() method defined below
            pass
        # Stack of uri -> prefix dicts; the current context starts out as
        # the very same dict object that sits at the bottom of the stack.
        base_context = {XML_NAMESPACE: 'xml'}
        self._ns_contexts = [base_context]
        self._current_context = base_context
        # Comments / processing instructions seen before the document exists.
        self.pending_events = []

    def pop(self):
        """Remove and return the top of the element stack."""
        top = self.elementStack[-1]
        del self.elementStack[-1]
        return top

    def setDocumentLocator(self, locator):
        """Remember the locator handed over by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration and enter a new prefix context."""
        try:
            declared = self._xmlns_attrs
        except AttributeError:
            declared = self._xmlns_attrs = []
        # A default-namespace declaration (no prefix) is stored as 'xmlns'.
        declared.append((prefix or 'xmlns', uri))
        # Save the pre-declaration state, then update the live context.
        snapshot = self._current_context.copy()
        self._ns_contexts.append(snapshot)
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the context saved by the matching startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName , attrs):
        """Create a namespace-aware element node and queue START_ELEMENT.

        name is a (uri, localname) pair.  The first element seen also
        triggers buildDocument().
        """
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        # Fold the namespace declarations collected by startPrefixMapping()
        # into the attribute set so they become real attributes below.
        declared = getattr(self, '_xmlns_attrs', None)
        if declared is not None:
            for decl_name, decl_value in declared:
                attrs._attrs[(xmlns_uri, decl_name)] = decl_value
            self._xmlns_attrs = []

        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not provide the
            # original name.  If not, build *a* valid tagName from the
            # current context.
            if tagName is None:
                prefix = self._current_context[uri]
                tagName = localname
                if prefix:
                    tagName = prefix + ":" + localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        elif self.document:
            # Without a namespace uri the tag name is just the localname.
            node = self.document.createElement(localname)
        else:
            node = self.buildDocument(None, localname)

        for (a_uri, a_localname), value in attrs.items():
            if a_uri:
                if a_uri == xmlns_uri:
                    # namespace declaration: 'xmlns' or 'xmlns:<prefix>'
                    qname = a_localname
                    if a_localname != 'xmlns':
                        qname = 'xmlns:' + a_localname
                else:
                    prefix = self._current_context[a_uri]
                    qname = a_localname
                    if prefix:
                        qname = prefix + ":" + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        event = [(START_ELEMENT, node), None]
        self.lastEvent[1] = event
        self.lastEvent = event
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue END_ELEMENT for the element on top of the stack."""
        event = [(END_ELEMENT, self.pop()), None]
        self.lastEvent[1] = event
        self.lastEvent = event

    def startElement(self, name, attrs):
        """Create a plain (non-namespace) element node and queue START_ELEMENT."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for attr_name, attr_value in attrs.items():
            attr = self.document.createAttribute(attr_name)
            attr.value = attr_value
            node.setAttributeNode(attr)

        event = [(START_ELEMENT, node), None]
        self.lastEvent[1] = event
        self.lastEvent = event
        self.push(node)

    def endElement(self, name):
        """Queue END_ELEMENT for the element on top of the stack."""
        event = [(END_ELEMENT, self.pop()), None]
        self.lastEvent[1] = event
        self.lastEvent = event

    def comment(self, s):
        """Queue a COMMENT event, or hold it back until a document exists."""
        if not self.document:
            # No document to create nodes with yet: keep the raw text and
            # let buildDocument() turn it into a node later.
            self.pending_events.append([(COMMENT, s), None])
            return
        event = [(COMMENT, self.document.createComment(s)), None]
        self.lastEvent[1] = event
        self.lastEvent = event

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, or hold it back until a
        document exists."""
        if not self.document:
            self.pending_events.append(
                [(PROCESSING_INSTRUCTION, target, data), None])
            return
        node = self.document.createProcessingInstruction(target, data)
        event = [(PROCESSING_INSTRUCTION, node), None]
        self.lastEvent[1] = event
        self.lastEvent = event

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a new text node."""
        event = [(IGNORABLE_WHITESPACE,
                  self.document.createTextNode(chars)), None]
        self.lastEvent[1] = event
        self.lastEvent = event

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a new text node."""
        event = [(CHARACTERS, self.document.createTextNode(chars)), None]
        self.lastEvent[1] = event
        self.lastEvent = event

    def startDocument(self):
        """Select minidom's implementation when no factory was supplied."""
        if self.documentFactory is None:
            from xml.dom import minidom
            self.documentFactory = minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document, queue START_DOCUMENT plus any held-back
        events, and return the document's first child."""
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        doc = self.documentFactory.createDocument(uri, tagname, None)
        self.document = doc
        start = [(START_DOCUMENT, doc), None]
        self.lastEvent[1] = start
        self.lastEvent = start
        self.push(doc)
        # Put everything we have seen so far into the event chain, now that
        # there is a document to create the nodes with.
        for held in self.pending_events:
            kind = held[0][0]
            if kind == PROCESSING_INSTRUCTION:
                _, target, data = held[0]
                pi = self.document.createProcessingInstruction(target, data)
                held[0] = (PROCESSING_INSTRUCTION, pi)
            elif kind == COMMENT:
                held[0] = (COMMENT, self.document.createComment(held[0][1]))
            else:
                raise AssertionError("Unknown pending event ", held[0][0])
            self.lastEvent[1] = held
            self.lastEvent = held
        self.pending_events = None
        return doc.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and drop the top entry of the element stack."""
        # lastEvent is deliberately left where it is, as before.
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        """clear(): Explicitly release parsing structures"""
        self.document = None
201ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class ErrorHandler:
    """Minimal SAX error handler: print warnings, raise everything else."""

    def warning(self, exception):
        """Print the warning to standard output and carry on."""
        # Parenthesised single-argument form: prints exactly what the print
        # statement printed, and is also valid where print is a function.
        print(exception)

    def error(self, exception):
        """Raise the recoverable error passed in by the parser."""
        raise exception

    def fatalError(self, exception):
        """Raise the fatal error passed in by the parser."""
        raise exception
209ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
210ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehclass DOMEventStream:
211ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def __init__(self, stream, parser, bufsize):
212ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.stream = stream
213ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.parser = parser
214ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.bufsize = bufsize
215ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if not hasattr(self.parser, 'feed'):
216ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            self.getEvent = self._slurp
217ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.reset()
218ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
219ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def reset(self):
220ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.pulldom = PullDOM()
221ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # This content handler relies on namespace support
222ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
223ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.parser.setContentHandler(self.pulldom)
224ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
225ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def __getitem__(self, pos):
226ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        rc = self.getEvent()
227ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if rc:
228ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return rc
229ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        raise IndexError
230ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
231ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def next(self):
232ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        rc = self.getEvent()
233ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if rc:
234ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return rc
235ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        raise StopIteration
236ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
237ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def __iter__(self):
238ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return self
239ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
240ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def expandNode(self, node):
241ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        event = self.getEvent()
242ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        parents = [node]
243ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while event:
244ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            token, cur_node = event
245ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if cur_node is node:
246ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                return
247ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if token != END_ELEMENT:
248ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                parents[-1].appendChild(cur_node)
249ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if token == START_ELEMENT:
250ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                parents.append(cur_node)
251ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif token == END_ELEMENT:
252ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                del parents[-1]
253ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            event = self.getEvent()
254ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
255ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def getEvent(self):
256ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # use IncrementalParser interface, so we get the desired
257ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # pull effect
258ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if not self.pulldom.firstEvent[1]:
259ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            self.pulldom.lastEvent = self.pulldom.firstEvent
260ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        while not self.pulldom.firstEvent[1]:
261ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            buf = self.stream.read(self.bufsize)
262ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if not buf:
263ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                self.parser.close()
264ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                return None
265ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            self.parser.feed(buf)
266ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        rc = self.pulldom.firstEvent[1][0]
267ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
268ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return rc
269ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
270ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _slurp(self):
271ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """ Fallback replacement for getEvent() using the
272ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            standard SAX2 interface, which means we slurp the
273ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            SAX events into memory (no performance gain, but
274ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            we are compatible to all SAX parsers).
275ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
276ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.parser.parse(self.stream)
277ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.getEvent = self._emit
278ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return self._emit()
279ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
280ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def _emit(self):
281ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """ Fallback replacement for getEvent() that emits
282ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            the events that _slurp() read previously.
283ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """
284ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        rc = self.pulldom.firstEvent[1][0]
285ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
286ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return rc
287ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
288ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    def clear(self):
289ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        """clear(): Explicitly release parsing objects"""
290ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.pulldom.clear()
291ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        del self.pulldom
292ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.parser = None
293ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        self.stream = None
294ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class SAX2DOM(PullDOM):
    """PullDOM variant that also links the nodes it creates into a tree.

    Every override first lets PullDOM create the node and queue the event,
    then appends that node to the element currently open on the stack.
    """

    def startElementNS(self, name, tagName , attrs):
        """Create the element via PullDOM and append it to its parent."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        # PullDOM pushed the new element; its parent sits right below it.
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        """Create the element via PullDOM and append it to its parent."""
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        """Create the processing instruction and append it to the open node.

        Before the document exists PullDOM only stores the instruction in
        pending_events; no node has been created and nothing is on the
        element stack, so there is nothing to attach in that case.
        """
        PullDOM.processingInstruction(self, target, data)
        if not self.document:
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        """Create the whitespace text node and append it to the open node."""
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        """Create the text node and append it to the open node."""
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)
326ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
327ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
# Chunk size used by parse() when the caller gives none: 20 short of 16 KiB.
default_bufsize = 2 ** 14 - 20
329ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or an already open stream.

    stream_or_string -- a string (taken as a file name and opened here) or
                        any object the event stream can read from
    parser           -- SAX parser to use; xml.sax.make_parser() when falsy
    bufsize          -- read size per chunk; default_bufsize when None
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance also accepts subclasses of the string types, which an
    # exact type() comparison would have mistaken for streams.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode: give the parser the file's raw bytes, untouched by
        # any platform newline translation.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
340ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
def parseString(string, parser=None):
    """Return a DOMEventStream over an in-memory document.

    string -- the document text
    parser -- SAX parser to use; xml.sax.make_parser() when falsy
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    # Buffer size equals the document length, so the first read() asks
    # for the whole text.
    size = len(string)
    source = StringIO(string)
    parser = parser or xml.sax.make_parser()
    return DOMEventStream(source, parser, size)
352