"""Pull-style DOM event construction on top of SAX.

``PullDOM`` is a SAX content handler that turns parser callbacks into a
singly linked list of ``(event_type, node)`` pairs.  ``DOMEventStream``
hands those pairs out one at a time.  ``SAX2DOM`` additionally links
element, text and processing-instruction nodes into their parent as they
are created.
"""
import xml.sax
import xml.sax.handler
import types

try:
    _StringTypes = [types.StringType, types.UnicodeType]
except AttributeError:
    # Either this build has no unicode type, or the ``types`` module does
    # not provide the string aliases at all (Python 3); in the latter case
    # fall back to the builtin ``str``.
    _StringTypes = [getattr(types, "StringType", str)]

START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"


class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that queues ``(event_type, node)`` pairs.

    Events are stored as a linked list of two-item lists
    ``[(event_type, node), next_cell]``.  ``firstEvent`` is a sentinel
    cell whose second slot points at the oldest unread event and
    ``lastEvent`` is the most recently appended cell.
    """
    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Create an empty handler.

        documentFactory -- object whose ``createDocument(uri, tagname,
        doctype)`` builds the document; when None, ``startDocument``
        selects the minidom implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        self.pending_events = []

    def pop(self):
        """Remove and return the top of the element stack."""
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def setDocumentLocator(self, locator):
        """Remember the locator supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration for the next start element."""
        if not hasattr(self, '_xmlns_attrs'):
            self._xmlns_attrs = []
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        # Save a snapshot of the context as it was before this mapping so
        # endPrefixMapping can restore it.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName , attrs):
        """Create an element (and its attributes) and queue START_ELEMENT.

        name    -- ``(uri, localname)`` pair.
        tagName -- qualified name, or None to derive one from the current
                   namespace context.
        attrs   -- attribute collection keyed by ``(uri, localname)``.

        The first element seen also builds the document, which queues
        START_DOCUMENT and any deferred comment/PI events first.
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = getattr(self, '_xmlns_attrs', None)
        if xmlns_attrs is not None:
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
            self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElementNS(self, name, tagName):
        """Pop the current element and queue END_ELEMENT for it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def startElement(self, name, attrs):
        """Non-namespace variant of startElementNS."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElement(self, name):
        """Pop the current element and queue END_ELEMENT for it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def comment(self, s):
        """Queue a COMMENT event, deferring it if no document exists yet."""
        if self.document:
            node = self.document.createComment(s)
            self.lastEvent[1] = [(COMMENT, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            # No document to create the node with; buildDocument converts
            # this placeholder into a real event later.
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, deferring it if no
        document exists yet."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
        self.lastEvent = self.lastEvent[1]

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(CHARACTERS, node), None]
        self.lastEvent = self.lastEvent[1]

    def startDocument(self):
        """Select the default document factory if none was supplied."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document for the root element and queue its events.

        Queues START_DOCUMENT, pushes the document on the element stack,
        then converts every deferred comment/PI placeholder into a real
        event.  Returns the document's first child.

        Raises AssertionError for a deferred event of an unknown type.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self.lastEvent[1] = [(START_DOCUMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ",e[0][0])
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Link an END_DOCUMENT event and pop the document off the stack."""
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None


class ErrorHandler:
    """Error handler that prints warnings and raises everything else."""

    def warning(self, exception):
        # Parenthesised so the statement is valid on Python 2 and 3 alike.
        print(exception)

    def error(self, exception):
        raise exception

    def fatalError(self, exception):
        raise exception


class DOMEventStream:
    """Iterable of ``(event_type, node)`` pairs read from a stream.

    With a parser that has a ``feed`` method the stream is read in
    ``bufsize`` chunks on demand; otherwise the whole stream is parsed on
    the first request and the queued events are replayed.
    """

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Install a fresh PullDOM handler on the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Return the next event regardless of ``pos`` (old-style
        iteration protocol); raise IndexError when there is none."""
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def next(self):
        """Return the next event; raise StopIteration when there is none."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events up to the one carrying ``node`` again, appending
        every intervening node below ``node`` to form its subtree."""
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next ``(event_type, node)`` pair, or None when the
        stream has no more data."""
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                # NOTE(review): an event queued by the handler during
                # close() (presumably END_DOCUMENT) stays queued and is
                # not returned by this call -- confirm that is intended.
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None once every queued event has been emitted.
        """
        cell = self.pulldom.firstEvent[1]
        if not cell:
            # Queue drained: report end of stream like getEvent() does
            # instead of failing on None[0].
            return None
        self.pulldom.firstEvent[1] = cell[1]
        return cell[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None


class SAX2DOM(PullDOM):
    """PullDOM variant that also appends each new node to its parent."""

    def startElementNS(self, name, tagName , attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        # Before the document exists the base class only defers the event:
        # no node is created and the element stack is empty, so there is
        # nothing to attach.  The instruction is still reported as an
        # event once the document is built.
        attach = bool(self.document)
        PullDOM.processingInstruction(self, target, data)
        if not attach:
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)


default_bufsize = (2 ** 14) - 20


def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or an open stream.

    stream_or_string -- a string is treated as a file name and opened;
                        anything else is used as a readable stream.
    parser           -- SAX parser to use; defaults to
                        ``xml.sax.make_parser()``.
    bufsize          -- chunk size for reads; defaults to
                        ``default_bufsize``.
    """
    if bufsize is None:
        bufsize = default_bufsize
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode: hand the parser the raw bytes so it can apply the
        # document's own encoding declaration.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)


def parseString(string, parser=None):
    """Return a DOMEventStream over the XML held in ``string``.

    The buffer size equals ``len(string)``, so the whole document is fed
    to the parser in a single read.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Only the io module is available (Python 3), and it keeps
            # text buffers and byte buffers apart.
            import io
            if isinstance(string, bytes):
                StringIO = io.BytesIO
            else:
                StringIO = io.StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)