expatreader.py revision da204daeaa39019f2134166308c5b9cdfa84b84a
1"""
2SAX driver for the Pyexpat C module.  This driver works with
3pyexpat.__version__ == '2.22'.
4"""
5
6version = "0.20"
7
8from xml.sax._exceptions import *
9
# xml.parsers.expat does not raise ImportError in Jython, so that platform
# is detected explicitly and reported as "reader not available".
import sys
if sys.platform[:4] == "java":
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys  # the import was only needed for the platform check above

# Import expat, turning a missing module into SAXReaderNotAvailable.
try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    # An expat module that lacks ParserCreate is reported the same way.
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

# Module-level aliases for the attribute classes used by the element
# event handlers of ExpatParser.
AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl
27
28import string
29import weakref
30
31# --- ExpatLocator
32
class ExpatLocator(xmlreader.Locator):
    """Locator handed to content handlers by ExpatParser.

    Only a weak reference to the owning parser is stored, so the content
    handler holding this locator does not form a reference cycle with the
    parser.  Every accessor falls back to a default once the parser has
    been collected.
    """

    def __init__(self, parser):
        # Weak reference: calling it yields the parser, or None once the
        # parser object is gone.
        self._ref = weakref.ref(parser)

    def getColumnNumber(self):
        """Return expat's current column, or None if no parser is active."""
        owner = self._ref()
        if owner is not None and owner._parser is not None:
            return owner._parser.ErrorColumnNumber
        return None

    def getLineNumber(self):
        """Return expat's current line, or 1 if no parser is active."""
        owner = self._ref()
        if owner is not None and owner._parser is not None:
            return owner._parser.ErrorLineNumber
        return 1

    def getPublicId(self):
        """Return the public id of the parser's current source, or None
        when the parser no longer exists."""
        owner = self._ref()
        if owner is not None:
            return owner._source.getPublicId()
        return None

    def getSystemId(self):
        """Return the system id of the parser's current source, or None
        when the parser no longer exists."""
        owner = self._ref()
        if owner is not None:
            return owner._source.getSystemId()
        return None
65
66
67# --- ExpatParser
68
class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the Pyexpat C module.

    The driver creates a fresh expat parser for every document, wires the
    expat callbacks to the registered SAX handlers and also implements the
    Locator interface itself (used when reporting parse errors).
    """

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create the driver.

        namespaceHandling -- initial state of the namespaces feature; when
                             true the *NS content handler events are used.
        bufsize           -- forwarded to IncrementalParser.__init__.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        # (parser, source) pairs saved while external entities are parsed
        self._entity_stack = []

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Set the expat base to the system id of source, if it has one."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Register the content handler; while a parse is running the
        expat callbacks that point directly at it are rebound as well."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of the namespaces feature.

        Raises SAXNotRecognizedException for any other feature name.
        """
        if name == handler.feature_namespaces:
            return self._namespaces
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set the namespaces feature.

        Raises SAXNotSupportedException while a parse is in progress and
        SAXNotRecognizedException for any other feature name.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")
        if name == handler.feature_namespaces:
            self._namespaces = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the lexical handler property.

        Raises SAXNotRecognizedException for any other property name.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the lexical handler property (None removes the handler).

        While a parse is running the expat callbacks are updated at once.
        Raises SAXNotRecognizedException for any other property name.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    # IncrementalParser methods

    def feed(self, data, isFinal = 0):
        """Pass a chunk of the document to expat.

        The first chunk of a document creates a new expat parser and fires
        startDocument.  expat errors are wrapped in a SAXParseException and
        handed to the error handler's fatalError method.
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error:
            error_code = self._parser.ErrorCode
            exc = SAXParseException(expat.ErrorString(error_code), None, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document: send a final empty chunk to expat, fire
        endDocument and drop the expat parser."""
        if self._entity_stack:
            # If we are completing an external entity, do nothing here
            return
        self.feed("", isFinal = 1)
        self._cont_handler.endDocument()
        self._parsing = 0
        # break cycle created by expat handlers pointing to our methods
        self._parser = None

    def _reset_cont_handler(self):
        """Point the expat callbacks that bypass this class directly at
        the current content handler."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind the lexical expat callbacks to the lexical handler, or
        clear them when the property has been set to None."""
        lex = self._lex_handler_prop
        if lex is None:
            self._parser.CommentHandler = None
            self._parser.StartCdataSectionHandler = None
            self._parser.EndCdataSectionHandler = None
        else:
            self._parser.CommentHandler = lex.comment
            self._parser.StartCdataSectionHandler = lex.startCDATA
            self._parser.EndCdataSectionHandler = lex.endCDATA

    def reset(self):
        """Create a new expat parser, install all callbacks and clear the
        parsing state."""
        if self._namespaces:
            # " " is the separator expat puts between URI and local name
            self._parser = expat.ParserCreate(None, " ")
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate()
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
        # DefaultHandler, DefaultHandlerExpand and NotStandaloneHandler
        # are deliberately left unset.
        self._parser.ExternalEntityRefHandler = self.external_entity_ref

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return expat's current column, or None if no parser exists."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's current line, or 1 if no parser exists."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the source being parsed."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the source being parsed."""
        return self._source.getSystemId()

    # event handlers

    def _split_name(self, name):
        """Turn an expat name into the tuple used by the *NS events.

        With namespace processing expat reports "uri localname"; a name
        without a separator has no namespace and becomes (None, name).
        """
        parts = name.split()
        if len(parts) == 1:
            return (None, name)
        return tuple(parts)

    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Report an element start with namespace-split element and
        attribute names; no qualified names are supplied."""
        pair = self._split_name(name)

        newattrs = {}
        for (aname, value) in attrs.items():
            newattrs[self._split_name(aname)] = value

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, {}))

    def end_element_ns(self, name):
        """Report an element end with a namespace-split name."""
        self._cont_handler.endElementNS(self._split_name(name), None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        # the base reported by expat is not part of the SAX event
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        # the base reported by expat is not part of the SAX event
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Parse an external entity with a sub-parser.

        The entity resolver is asked for the input source, the current
        parser and source are pushed on the entity stack, and the entity
        is parsed through the normal feed/close machinery.  Returns 1 on
        success and 0 on failure; the saved parser and source are
        restored in both cases so later calls act on the outer parser.
        """
        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE(review): deliberately broad -- any failure is reported
            # to expat through the return value.
            status = 0  # FIXME: save error info here?
        else:
            status = 1

        # Restore the outer parser on success *and* failure.
        (self._parser, self._source) = self._entity_stack.pop()
        return status
289
290# ---
291
def create_parser(*args, **kwargs):
    """Return a new ExpatParser.

    All positional and keyword arguments are forwarded unchanged to the
    ExpatParser constructor.
    """
    # Extended call syntax replaces the deprecated apply() builtin.
    return ExpatParser(*args, **kwargs)
294
295# ---
296
if __name__ == "__main__":
    # Manual smoke test: parse a sample document and echo it as XML.
    import xml.sax
    import xml.sax.saxutils
    p = create_parser()
    # XMLGenerator is defined in xml.sax.saxutils, not in the xml.sax
    # package namespace.
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("../../../hamlet.xml")
303