# xmlreader.py revision 16f6329e6153c4b92f2175a5560e372a762befe6
"""An XML Reader is the SAX 2 name for an XML parser. XML Parsers
should be based on this code. """

# Python 2 resolves ``handler`` as a sibling module of the enclosing
# package (implicit relative import).  Where that lookup is not
# available, fall back to the standard library's xml.sax package, which
# provides a module of the same name with the handler base classes.
try:
    import handler
except ImportError:
    from xml.sax import handler

# The default XMLReader methods below raise these exceptions, so the
# names must be in module scope; without this import those methods fail
# with NameError instead of the intended SAX exception.
try:
    from _exceptions import SAXNotRecognizedException, SAXNotSupportedException
except ImportError:
    from xml.sax._exceptions import (SAXNotRecognizedException,
                                     SAXNotSupportedException)

# ===== XMLREADER =====

class XMLReader:
    """Interface for reading an XML document using callbacks.

    XMLReader is the interface that an XML parser's SAX2 driver must
    implement. This interface allows an application to set and query
    features and properties in the parser, to register event handlers
    for document processing, and to initiate a document parse.

    All SAX interfaces are assumed to be synchronous: the parse
    methods must not return until parsing is complete, and readers
    must wait for an event-handler callback to return before reporting
    the next event."""

    def __init__(self):
        # Start with the do-nothing default handlers so that a reader
        # always has a handler object of each kind.
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        """Parse an XML document from a system identifier or an InputSource.

        Raises NotImplementedError here; drivers must override it."""
        raise NotImplementedError("This method must be implemented!")

    def getContentHandler(self):
        "Returns the current ContentHandler."
        return self._cont_handler

    def setContentHandler(self, handler):
        "Registers a new object to receive document content events."
        self._cont_handler = handler

    def getDTDHandler(self):
        "Returns the current DTD handler."
        return self._dtd_handler

    def setDTDHandler(self, handler):
        "Register an object to receive basic DTD-related events."
        self._dtd_handler = handler

    def getEntityResolver(self):
        "Returns the current EntityResolver."
        return self._ent_handler

    def setEntityResolver(self, resolver):
        "Register an object to resolve external entities."
        self._ent_handler = resolver

    def getErrorHandler(self):
        "Returns the current ErrorHandler."
        return self._err_handler

    def setErrorHandler(self, handler):
        "Register an object to receive error-message events."
        self._err_handler = handler

    def setLocale(self, locale):
        """Allow an application to set the locale for errors and warnings.

        SAX parsers are not required to provide localization for errors
        and warnings; if they cannot support the requested locale,
        however, they must throw a SAX exception. Applications may
        request a locale change in the middle of a parse.

        This default implementation always raises
        SAXNotSupportedException."""
        raise SAXNotSupportedException("Locale support not implemented")

    def getFeature(self, name):
        """Looks up and returns the state of a SAX2 feature.

        This default implementation recognizes no features and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Sets the state of a SAX2 feature.

        This default implementation recognizes no features and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Looks up and returns the value of a SAX2 property.

        This default implementation recognizes no properties and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Sets the value of a SAX2 property.

        This default implementation recognizes no properties and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

class IncrementalParser(XMLReader):
    """This interface adds three extra methods to the XMLReader
    interface that allow XML parsers to support incremental
    parsing. Support for this interface is optional, since not all
    underlying XML parsers support this functionality.

    When the parser is instantiated it is ready to begin accepting
    data from the feed method immediately. After parsing has been
    finished with a call to close the reset method must be called to
    make the parser ready to accept new data, either from feed or
    using the parse method.

    Note that these methods must _not_ be called during parsing, that
    is, after parse has been called and before it returns.

    By default, the class also implements the parse method of the XMLReader
    interface using the feed, close and reset methods of the
    IncrementalParser interface as a convenience to SAX 2.0 driver
    writers."""

    def __init__(self, bufsize=2**16):
        # bufsize is the size passed to each read() call made by parse().
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Parse source by reading its byte stream in bufsize chunks.

        The source is normalized with saxutils.prepare_input_source,
        handed to prepareParser, and then every non-empty chunk read
        from its byte stream is passed to feed.  close is called once
        the stream is exhausted."""
        # Imported here, as in the original, rather than at module level;
        # same sibling-then-stdlib lookup as for ``handler`` above.
        try:
            import saxutils
        except ImportError:
            from xml.sax import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)
        stream = source.getByteStream()
        chunk = stream.read(self._bufsize)
        # Test truthiness rather than comparing with "": an empty bytes
        # object never equals "", which would make the loop endless on
        # streams that return bytes at end of file.
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """This method gives the raw XML data in the data parameter to
        the parser and makes it parse the data, emitting the
        corresponding events. It is allowed for XML constructs to be
        split across several calls to feed.

        feed may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """This method is called by the parse implementation to allow
        the SAX 2.0 driver to prepare itself for parsing."""
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """This method is called when the entire XML document has been
        passed to the parser through the feed method, to notify the
        parser that there are no more data. This allows the parser to
        do the final checks on the document and empty the internal
        data buffer.

        The parser will not be ready to parse another document until
        the reset method has been called.

        close may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """This method is called after close has been called to reset
        the parser so that it is ready to parse new documents. The
        results of calling parse or feed after close without calling
        reset are undefined."""
        raise NotImplementedError("This method must be implemented!")

# ===== LOCATOR =====

class Locator:
    """Interface for associating a SAX event with a document
    location. A locator object will return valid results only during
    calls to ContentHandler methods; at any other time, the
    results are unpredictable.

    The defaults implemented here report no location: -1 for the
    line and column numbers and None for the identifiers."""

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return -1

    def getPublicId(self):
        "Return the public identifier for the current event."
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return None

# ===== INPUTSOURCE =====

class InputSource:
    """Encapsulation of the information needed by the XMLReader to
    read entities.

    This class may include information about the public identifier,
    system identifier, byte stream (possibly with character encoding
    information) and/or the character stream of an entity.

    Applications will create objects of this class for use in the
    XMLReader.parse method and for returning from
    EntityResolver.resolveEntity.

    An InputSource belongs to the application, the XMLReader is not
    allowed to modify InputSource objects passed to it from the
    application, although it may make copies and modify those."""

    def __init__(self, system_id = None):
        # Everything except the system identifier starts out unset.
        self.__system_id = system_id
        self.__public_id = None
        self.__encoding  = None
        self.__bytefile  = None
        self.__charfile  = None

    def setPublicId(self, public_id):
        "Sets the public identifier of this InputSource."
        self.__public_id = public_id

    def getPublicId(self):
        "Returns the public identifier of this InputSource."
        return self.__public_id

    def setSystemId(self, system_id):
        "Sets the system identifier of this InputSource."
        self.__system_id = system_id

    def getSystemId(self):
        "Returns the system identifier of this InputSource."
        return self.__system_id

    def setEncoding(self, encoding):
        """Sets the character encoding of this InputSource.

        The encoding must be a string acceptable for an XML encoding
        declaration (see section 4.3.3 of the XML recommendation).

        The encoding attribute of the InputSource is ignored if the
        InputSource also contains a character stream."""
        self.__encoding = encoding

    def getEncoding(self):
        "Get the character encoding of this InputSource."
        return self.__encoding

    def setByteStream(self, bytefile):
        """Set the byte stream (a Python file-like object which does
        not perform byte-to-character conversion) for this input
        source.

        The SAX parser will ignore this if there is also a character
        stream specified, but it will use a byte stream in preference
        to opening a URI connection itself.

        If the application knows the character encoding of the byte
        stream, it should set it with the setEncoding method."""
        self.__bytefile = bytefile

    def getByteStream(self):
        """Get the byte stream for this input source.

        The getEncoding method will return the character encoding for
        this byte stream, or None if unknown."""
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Set the character stream for this input source. (The stream
        must be a Python 1.6 Unicode-wrapped file-like that performs
        conversion to Unicode strings.)

        If there is a character stream specified, the SAX parser will
        ignore any byte stream and will not attempt to open a URI
        connection to the system identifier."""
        self.__charfile = charfile

    def getCharacterStream(self):
        "Get the character stream for this input source."
        return self.__charfile

# ===== ATTRIBUTESIMPL =====

class AttributesImpl:
    """Attribute collection for elements reported without namespace
    processing.  Names and qualified names are the same thing here,
    so the name/qname conversion methods return their argument."""

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}."""
        self._attrs = attrs

    def getLength(self):
        "Return the number of attributes."
        return len(self._attrs)

    def getType(self, name):
        "Return the attribute type, which is always 'CDATA' here."
        return "CDATA"

    def getValue(self, name):
        "Return the value of attribute name; KeyError if absent."
        return self._attrs[name]

    def getValueByQName(self, name):
        "Return the value for a qualified name; KeyError if absent."
        return self._attrs[name]

    def getNameByQName(self, name):
        "Return the name for a qualified name; KeyError if absent."
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        "Return the qualified name for a name; KeyError if absent."
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        "Return a list of the attribute names."
        return list(self._attrs.keys())

    def getQNames(self):
        "Return a list of the qualified attribute names."
        return list(self._attrs.keys())

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def keys(self):
        "Return a list of the attribute names."
        return list(self._attrs.keys())

    def has_key(self, name):
        "Return true if an attribute called name is present."
        return name in self._attrs

    def __contains__(self, name):
        # Lets callers write ``name in attrs`` as well as has_key().
        return name in self._attrs

    def get(self, name, alternative=None):
        "Return the value of name, or alternative if it is absent."
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new instance of the same class.

        The new instance wraps the same underlying mapping object; the
        mapping itself is not duplicated."""
        return self.__class__(self._attrs)

    def items(self):
        "Return a list of (name, value) pairs."
        return list(self._attrs.items())

    def values(self):
        "Return a list of the attribute values."
        return list(self._attrs.values())

# ===== ATTRIBUTESNSIMPL =====

class AttributesNSImpl(AttributesImpl):
    """Attribute collection for elements reported with namespace
    processing: attributes are keyed by (ns_uri, lname) pairs and a
    second mapping gives the qualified name of each pair."""

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}."""
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        "Return the value for qualified name; KeyError if absent."
        # qnames maps the other way round, so search it for the key.
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        raise KeyError(name)

    def getNameByQName(self, name):
        "Return the (ns_uri, lname) pair for a qualified name."
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        "Return the qualified name for a (ns_uri, lname) pair."
        return self._qnames[name]

    def getQNames(self):
        "Return a list of the qualified attribute names."
        return list(self._qnames.values())

    def copy(self):
        """Return a new instance of the same class wrapping the same
        two underlying mapping objects."""
        return self.__class__(self._attrs, self._qnames)


def _test():
    "Smoke test: check that the interface classes can be instantiated."
    XMLReader()
    IncrementalParser()
    Locator()

if __name__ == "__main__":
    _test()