xmlreader.py revision 16f6329e6153c4b92f2175a5560e372a762befe6
1"""An XML Reader is the SAX 2 name for an XML parser. XML Parsers
2should be based on this code. """
3
import handler
from _exceptions import SAXNotSupportedException, SAXNotRecognizedException
5
6# ===== XMLREADER =====
7
class XMLReader:
    """Interface for reading an XML document using callbacks.

    XMLReader is the interface that an XML parser's SAX2 driver must
    implement. This interface allows an application to set and query
    features and properties in the parser, to register event handlers
    for document processing, and to initiate a document parse.

    All SAX interfaces are assumed to be synchronous: the parse
    methods must not return until parsing is complete, and readers
    must wait for an event-handler callback to return before reporting
    the next event.

    This base class only stores the four handler objects; parse() and
    all feature/property/locale methods raise and are meant to be
    overridden by concrete drivers."""

    def __init__(self):
        """Install a default instance of each handler type."""
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        """Parse an XML document from a system identifier or an InputSource.

        The base class always raises NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")

    def getContentHandler(self):
        "Returns the current ContentHandler."
        return self._cont_handler

    def setContentHandler(self, handler):
        "Registers a new object to receive document content events."
        # The parameter shadows the module-level `handler` import, but only
        # inside this method; the name is part of the public signature.
        self._cont_handler = handler

    def getDTDHandler(self):
        "Returns the current DTD handler."
        return self._dtd_handler

    def setDTDHandler(self, handler):
        "Register an object to receive basic DTD-related events."
        self._dtd_handler = handler

    def getEntityResolver(self):
        "Returns the current EntityResolver."
        return self._ent_handler

    def setEntityResolver(self, resolver):
        "Register an object to resolve external entities."
        self._ent_handler = resolver

    def getErrorHandler(self):
        "Returns the current ErrorHandler."
        return self._err_handler

    def setErrorHandler(self, handler):
        "Register an object to receive error-message events."
        self._err_handler = handler

    def setLocale(self, locale):
        """Allow an application to set the locale for errors and warnings.

        SAX parsers are not required to provide localization for errors
        and warnings; if they cannot support the requested locale,
        however, they must throw a SAX exception. Applications may
        request a locale change in the middle of a parse.

        The base class supports no locale and always raises
        SAXNotSupportedException."""
        raise SAXNotSupportedException("Locale support not implemented")

    def getFeature(self, name):
        """Looks up and returns the state of a SAX2 feature.

        The base class recognizes no features and always raises
        SAXNotRecognizedException naming the requested feature."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Sets the state of a SAX2 feature.

        The base class recognizes no features and always raises
        SAXNotRecognizedException naming the requested feature."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Looks up and returns the value of a SAX2 property.

        The base class recognizes no properties and always raises
        SAXNotRecognizedException naming the requested property."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Sets the value of a SAX2 property.

        The base class recognizes no properties and always raises
        SAXNotRecognizedException naming the requested property."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)
87
class IncrementalParser(XMLReader):
    """This interface adds three extra methods to the XMLReader
    interface that allow XML parsers to support incremental
    parsing. Support for this interface is optional, since not all
    underlying XML parsers support this functionality.

    When the parser is instantiated it is ready to begin accepting
    data from the feed method immediately. After parsing has been
    finished with a call to close the reset method must be called to
    make the parser ready to accept new data, either from feed or
    using the parse method.

    Note that these methods must _not_ be called during parsing, that
    is, after parse has been called and before it returns.

    By default, the class also implements the parse method of the
    XMLReader interface on top of prepareParser, feed and close, as a
    convenience to SAX 2.0 driver writers. parse does not call reset
    itself."""

    def __init__(self, bufsize=2**16):
        """Create the parser.

        bufsize is the size argument passed to every read() call that
        parse() makes on the input byte stream."""
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Read the whole of source in bufsize-sized chunks and feed it
        to the parser.

        source is first passed through saxutils.prepare_input_source;
        the result must provide getByteStream(). prepareParser() is
        called once before the first read, feed() once per non-empty
        chunk, and close() once after the stream is exhausted."""
        import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)
        stream = source.getByteStream()
        chunk = stream.read(self._bufsize)
        # Test truthiness rather than comparing against "": an empty read of
        # any string type (byte or Unicode) must end the loop, otherwise a
        # stream whose empty value is not equal to "" would spin forever.
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """This method gives the raw XML data in the data parameter to
        the parser and makes it parse the data, emitting the
        corresponding events. It is allowed for XML constructs to be
        split across several calls to feed.

        feed may raise SAXException. The base class always raises
        NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """This method is called by the parse implementation to allow
        the SAX 2.0 driver to prepare itself for parsing.

        The base class always raises NotImplementedError."""
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """This method is called when the entire XML document has been
        passed to the parser through the feed method, to notify the
        parser that there are no more data. This allows the parser to
        do the final checks on the document and empty the internal
        data buffer.

        The parser will not be ready to parse another document until
        the reset method has been called.

        close may raise SAXException. The base class always raises
        NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """This method is called after close has been called to reset
        the parser so that it is ready to parse new documents. The
        results of calling parse or feed after close without calling
        reset are undefined.

        The base class always raises NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")
157
158# ===== LOCATOR =====
159
class Locator:
    """Default locator that associates a SAX event with a position in
    the document.

    The values reported by a locator are only meaningful while a
    handler callback is running; outside of one the results are
    unpredictable. This base implementation knows no position at all,
    so it reports -1 for the numeric fields and None for the
    identifiers."""

    def getLineNumber(self):
        """Return the line on which the current event ends (-1 here)."""
        return -1

    def getColumnNumber(self):
        """Return the column at which the current event ends (-1 here)."""
        return -1

    def getSystemId(self):
        """Return the system identifier of the current event (None here)."""
        return None

    def getPublicId(self):
        """Return the public identifier of the current event (None here)."""
        return None
181
182# ===== INPUTSOURCE =====
183
class InputSource:
    """Bundle of everything an XMLReader needs in order to read an
    entity.

    An instance can carry a public identifier, a system identifier, a
    byte stream (optionally with the name of its character encoding)
    and/or a character stream.

    Applications build these objects to hand to XMLReader.parse and to
    return from EntityResolver.resolveEntity.

    The object is owned by the application: an XMLReader must not
    modify an InputSource it was given, although it is free to copy
    one and change the copy."""

    def __init__(self, system_id=None):
        # Only the system identifier can be supplied up front; every
        # other field starts out unset.
        self.__public_id = None
        self.__system_id = system_id
        self.__bytefile = None
        self.__charfile = None
        self.__encoding = None

    # --- identifiers ---

    def getPublicId(self):
        """Return the public identifier stored on this InputSource."""
        return self.__public_id

    def setPublicId(self, public_id):
        """Store public_id as the public identifier."""
        self.__public_id = public_id

    def getSystemId(self):
        """Return the system identifier stored on this InputSource."""
        return self.__system_id

    def setSystemId(self, system_id):
        """Store system_id as the system identifier."""
        self.__system_id = system_id

    # --- encoding ---

    def getEncoding(self):
        """Return the character encoding stored on this InputSource."""
        return self.__encoding

    def setEncoding(self, encoding):
        """Store the character encoding of this InputSource.

        The value must be a string that would be acceptable in an XML
        encoding declaration (section 4.3.3 of the XML
        recommendation).

        When a character stream is also present, the encoding is
        ignored."""
        self.__encoding = encoding

    # --- streams ---

    def getByteStream(self):
        """Return the byte stream of this input source.

        getEncoding gives the character encoding that goes with the
        stream, or None when it is not known."""
        return self.__bytefile

    def setByteStream(self, bytefile):
        """Store the byte stream of this input source: a Python
        file-like object that performs no byte-to-character
        conversion.

        A SAX parser ignores the byte stream when a character stream
        is also given, but prefers it over opening a URI connection
        on its own.

        Applications that know the encoding of the stream should
        record it with setEncoding."""
        self.__bytefile = bytefile

    def getCharacterStream(self):
        """Return the character stream of this input source."""
        return self.__charfile

    def setCharacterStream(self, charfile):
        """Store the character stream of this input source. (It must
        be a Python 1.6 Unicode-wrapped file-like object that converts
        its data to Unicode strings.)

        When a character stream is present the SAX parser ignores any
        byte stream and does not try to open a URI connection to the
        system identifier."""
        self.__charfile = charfile
270
271# ===== ATTRIBUTESIMPL =====
272
class AttributesImpl:
    """Attribute collection for non-namespace-aware parsing.

    Wraps a mapping of attribute name to value and exposes it both
    through the SAX attribute methods and through a read-only
    dictionary-like interface. Because there are no namespaces, an
    attribute's name and its qualified name are the same string."""

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}. The mapping is
        stored by reference, not copied."""
        self._attrs = attrs

    def getLength(self):
        "Return the number of attributes."
        return len(self._attrs)

    def getType(self, name):
        """Return the type of attribute name.

        Always "CDATA"; name is not checked against the mapping."""
        return "CDATA"

    def getValue(self, name):
        "Return the value of attribute name; KeyError if absent."
        return self._attrs[name]

    def getValueByQName(self, name):
        "Return the value of the attribute with qualified name name."
        return self._attrs[name]

    def getNameByQName(self, name):
        """Return the name for qualified name name (the same string).

        Raises KeyError(name) if there is no such attribute."""
        if not self._attrs.has_key(name):
            # Carry the missing key, like the dict-backed lookups above do.
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        """Return the qualified name for name (the same string).

        Raises KeyError(name) if there is no such attribute."""
        if not self._attrs.has_key(name):
            raise KeyError(name)
        return name

    def getNames(self):
        "Return the attribute names."
        return self._attrs.keys()

    def getQNames(self):
        "Return the qualified attribute names (same as getNames here)."
        return self._attrs.keys()

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def keys(self):
        "Return the attribute names."
        return self._attrs.keys()

    def has_key(self, name):
        "Return true if an attribute called name exists."
        return self._attrs.has_key(name)

    def get(self, name, alternative=None):
        "Return the value of name, or alternative if it is absent."
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new instance of the same class.

        The new object wraps the same underlying mapping; the mapping
        itself is not duplicated."""
        return self.__class__(self._attrs)

    def items(self):
        "Return the (name, value) pairs."
        return self._attrs.items()

    def values(self):
        "Return the attribute values."
        return self._attrs.values()
332
333# ===== ATTRIBUTESNSIMPL =====
334
class AttributesNSImpl(AttributesImpl):
    """Attribute collection for namespace-aware parsing.

    Attributes are keyed by (ns_uri, lname) tuples; a second mapping
    records the qualified name that belongs to each key."""

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}.

        Both mappings are stored by reference, not copied."""
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose qualified name is name.

        Raises KeyError(name) if no attribute has that qualified name."""
        # The qname mapping is keyed by (ns_uri, lname), so finding a qname
        # requires a linear scan of its items.
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) key whose qualified name is name.

        Raises KeyError(name) if no attribute has that qualified name."""
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        """Return the qualified name recorded for the (ns_uri, lname) key
        name; KeyError if the key is absent."""
        return self._qnames[name]

    def getQNames(self):
        "Return the qualified names of all attributes."
        return self._qnames.values()

    def copy(self):
        """Return a new instance of the same class.

        The new object wraps the same two underlying mappings; they
        are not duplicated."""
        return self.__class__(self._attrs, self._qnames)
367
368
def _test():
    """Smoke test: make sure each base class can be instantiated."""
    for factory in (XMLReader, IncrementalParser, Locator):
        factory()

# Run the smoke test when the module is executed as a script.
if __name__ == "__main__":
    _test()
376