expatreader.py revision e3c37d660f5641f55c12313fde8e20f8178d942a
"""
SAX driver built on top of the Pyexpat C module.  This driver works with
pyexpat.__version__ == '2.22'.
"""

version = "0.20"

from xml.sax._exceptions import *

# Importing xml.parsers.expat does not fail with ImportError under Jython,
# so the platform is checked explicitly before the import is attempted.
import sys
if sys.platform.startswith("java"):
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

# Refuse to load when expat is missing, or when the module that was found
# does not provide the ParserCreate factory this driver depends on.
try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

# Short module-level aliases for the attribute containers handed to the
# content handler.
AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

import string
29
30# --- ExpatParser
31
class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the Pyexpat C module.

    The driver is both the incremental parser and the document locator:
    it creates an expat parser object, wires expat's callbacks to the
    registered SAX handlers and answers position queries from the
    current expat parser.

    namespaceHandling is the initial value of the SAX namespaces feature;
    bufsize is passed unchanged to IncrementalParser.__init__.
    """

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None                 # expat parser, built by reset()
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None       # optional lexical handler
        self._parsing = 0                   # true between first feed() and close()
        self._entity_stack = []             # (parser, source) of suspended entities

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(self)
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        "Pass the system id of source, if it has one, to expat as the base."
        system_id = source.getSystemId()
        if system_id is not None:
            self._parser.SetBase(system_id)

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        "Register handler; while parsing, rebind the expat callbacks too."
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of the namespaces feature.

        Any other feature name raises SAXNotRecognizedException.
        """
        if name == handler.feature_namespaces:
            return self._namespaces
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set the namespaces feature.

        Raises SAXNotSupportedException while a parse is in progress and
        SAXNotRecognizedException for any other feature name.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")
        if name == handler.feature_namespaces:
            self._namespaces = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the lexical handler property.

        Any other property name raises SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the lexical handler property; None removes the handler.

        The change is applied to the running expat parser when a parse is
        in progress.  Any other property name raises
        SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    # IncrementalParser methods

    def feed(self, data, isFinal = 0):
        """Hand a chunk of the document to expat.

        The first call after construction, reset() or close() creates a
        fresh expat parser and emits startDocument.  expat errors are
        wrapped in a SAXParseException and passed to the error handler's
        fatalError method.
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # When true, expat treats this chunk as the end of the
            # document.  Chunks are not normally final, except for the
            # empty one that close() feeds.
            self._parser.Parse(data, isFinal)
        except expat.error:
            error_code = self._parser.ErrorCode
            exc = SAXParseException(expat.ErrorString(error_code), None, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document: final feed, endDocument, release expat.

        Does nothing while an external entity is being parsed, because
        that nested parse must not terminate the enclosing document.
        """
        if self._entity_stack:
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed("", isFinal = 1)
            self._cont_handler.endDocument()
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            # Leave the "parsing" state even when the final feed or
            # endDocument raised, so the next feed() starts a new document
            # instead of pushing data into a finished parser.  On failure
            # the expat parser is kept so the locator can still report
            # where the error happened.
            self._parsing = 0

    def _reset_cont_handler(self):
        # Processing instructions and character data go straight from
        # expat to the content handler, without a wrapper method.
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        # Bind the lexical callbacks of the current expat parser, or
        # clear them when the property was set back to None.
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA

    def reset(self):
        "Create a new expat parser and bind all callbacks to it."
        if self._namespaces:
            # With a separator, expat reports qualified names as
            # "uri local" strings; see _split_ns_name.
            self._parser = expat.ParserCreate(None, " ")
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate()
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop is not None:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        "Return expat's current column, or None when there is no parser."
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        "Return expat's current line, or 1 when there is no parser."
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        "Return the public id of the source currently being parsed."
        return self._source.getPublicId()

    def getSystemId(self):
        "Return the system id of the source currently being parsed."
        return self._source.getSystemId()

    # event handlers

    def _split_ns_name(self, name):
        # Turn expat's "uri local" string into a (uri, localname) tuple.
        # A name without a namespace is reported without the separator
        # and becomes (None, name).
        parts = name.split()
        if len(parts) == 1:
            return (None, name)
        return tuple(parts)

    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        pair = self._split_ns_name(name)

        # Attribute names are qualified the same way as element names.
        newattrs = {}
        for (aname, value) in attrs.items():
            newattrs[self._split_ns_name(aname)] = value

        # No qualified names are available here, hence None and the
        # empty qname mapping.
        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, {}))

    def end_element_ns(self, name):
        self._cont_handler.endElementNS(self._split_ns_name(name), None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        # expat passes (base, sysid, pubid); the DTD handler is called
        # with the public id before the system id.
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Parse an external entity with a sub-parser created by expat.

        Returns 1 when the entity was parsed and 0 when parsing it
        failed; the value is handed back to expat as the callback result.
        """
        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Suspend the current parser and source while the entity is read.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        succeeded = 1
        try:
            try:
                xmlreader.IncrementalParser.parse(self, source)
            except Exception:
                # Deliberate best effort: the failure is reported to expat
                # through the return value.  KeyboardInterrupt and
                # SystemExit are no longer swallowed here.
                succeeded = 0  # FIXME: save error info here?
        finally:
            # Always switch back to the suspended parser and source.
            # Without this, a failed entity left the driver pointing at
            # the entity sub-parser and made close() a permanent no-op.
            (self._parser, self._source) = self._entity_stack.pop()
        return succeeded
252
253# ---
254
def create_parser(*args, **kwargs):
    """Return a new ExpatParser.

    All positional and keyword arguments are forwarded unchanged to the
    ExpatParser constructor.
    """
    # Extended call syntax replaces the deprecated apply() builtin, which
    # no longer exists in Python 3.
    return ExpatParser(*args, **kwargs)
257
258# ---
259
if __name__ == "__main__":
    # Manual smoke test: parse a sample document and echo it back as XML.
    # XMLGenerator is provided by xml.sax.saxutils, not by the xml.sax
    # package namespace, so that submodule is imported explicitly.
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("../../../hamlet.xml")
266