"""Facility to use the Expat parser to load a minidom instance
from a string or file.

This avoids all the overhead of SAX and pulldom to gain performance.
"""
6fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
# Warning!
#
# This module is tightly bound to the implementation details of the
# minidom DOM and can't be used with other DOM implementations.  This
# is due, in part, to a lack of appropriate methods in the DOM (there is
# no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance.  The latter is the cause of some fairly
# cryptic code.
#
# Performance hacks:
#
#   -  .character_data_handler() has an extra case in which continuing
#      data is appended to an existing Text node; this can be a
#      speedup since pyexpat can break up character data into multiple
#      callbacks even though we set the buffer_text attribute on the
#      parser.  This also gives us the advantage that we don't need a
#      separate normalization pass.
#
#   -  Determining that a node exists is done using an identity comparison
#      with None rather than a truth test; this avoids searching for and
#      calling any methods on the node object if it exists.  (A rather
#      nice speedup is achieved this way as well!)
29fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
300e3f591aeeef9ed715f8770320f4c4c7332a8794Thomas Woutersfrom xml.dom import xmlbuilder, minidom, Node
310e3f591aeeef9ed715f8770320f4c4c7332a8794Thomas Woutersfrom xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
320e3f591aeeef9ed715f8770320f4c4c7332a8794Thomas Woutersfrom xml.parsers import expat
330e3f591aeeef9ed715f8770320f4c4c7332a8794Thomas Woutersfrom xml.dom.minidom import _append_child, _set_attribute_node
340e3f591aeeef9ed715f8770320f4c4c7332a8794Thomas Woutersfrom xml.dom.NodeFilter import NodeFilter
35fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
# Module-level aliases for the Node type codes referenced below.
TEXT_NODE, CDATA_SECTION_NODE, DOCUMENT_NODE = (
    Node.TEXT_NODE,
    Node.CDATA_SECTION_NODE,
    Node.DOCUMENT_NODE,
)

# Module-level aliases for the verdict codes defined on DOMBuilderFilter.
FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP, FILTER_INTERRUPT = (
    xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT,
    xmlbuilder.DOMBuilderFilter.FILTER_REJECT,
    xmlbuilder.DOMBuilderFilter.FILTER_SKIP,
    xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT,
)

# The minidom DOMImplementation used to create new documents.
theDOMImplementation = minidom.getDOMImplementation()
46fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
47fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis# Expat typename -> TypeInfo
48fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis_typeinfo_map = {
49fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    "CDATA":    minidom.TypeInfo(None, "cdata"),
50fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    "ENUM":     minidom.TypeInfo(None, "enumeration"),
51fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    "ENTITY":   minidom.TypeInfo(None, "entity"),
52fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    "ENTITIES": minidom.TypeInfo(None, "entities"),
53fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    "ID":       minidom.TypeInfo(None, "id"),
54fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    "IDREF":    minidom.TypeInfo(None, "idref"),
55fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    "IDREFS":   minidom.TypeInfo(None, "idrefs"),
56fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    "NMTOKEN":  minidom.TypeInfo(None, "nmtoken"),
57fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
58fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    }
59fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class ElementInfo(object):
    """Declaration data kept for one element type.

    Stores the tag name, an optional content model, and a list of
    attribute declaration records.  Within a record, index 1 is read
    as the attribute name and index -2 as its declared type string.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        self._attr_info = []
        self._model = model

    def __getstate__(self):
        # Slots-only class: state is the three slot values as a tuple.
        return (self._attr_info, self._model, self.tagName)

    def __setstate__(self, state):
        attr_info, model, tag_name = state
        self._attr_info = attr_info
        self._model = model
        self.tagName = tag_name

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute *aname*.

        Falls back to minidom._no_type when no record names *aname*.
        """
        for record in self._attr_info:
            if record[1] != aname:
                continue
            declared = record[-2]
            # A type starting with "(" is treated as an enumeration.
            if declared[0] == "(":
                return _typeinfo_map["ENUM"]
            return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Always return minidom._no_type; both arguments are ignored."""
        return minidom._no_type

    def isElementContent(self):
        """Return True if a model is set and is neither ANY nor MIXED."""
        if not self._model:
            return False
        content_type = self._model[0]
        return content_type not in (expat.model.XML_CTYPE_ANY,
                                    expat.model.XML_CTYPE_MIXED)

    def isEmpty(self):
        """Return True if a model is set and its type is EMPTY."""
        if not self._model:
            return False
        return self._model[0] == expat.model.XML_CTYPE_EMPTY

    def isId(self, aname):
        """Return True if the first record for *aname* has type "ID"."""
        for record in self._attr_info:
            if record[1] == aname:
                return record[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        """Delegate to isId() with the pair (auri, aname) as the name."""
        # not sure this is meaningful
        return self.isId((auri, aname))
110fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
111fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwisdef _intern(builder, s):
112fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    return builder._intern_setdefault(s, s)
113fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
114fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwisdef _parse_ns_name(builder, name):
115fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    assert ' ' in name
116fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    parts = name.split(' ')
117fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    intern = builder._intern_setdefault
118fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    if len(parts) == 3:
119fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        uri, localname, prefix = parts
120fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        prefix = intern(prefix, prefix)
121fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        qname = "%s:%s" % (prefix, localname)
122fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        qname = intern(qname, qname)
123fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        localname = intern(localname, localname)
1249077d24d7f85e09e53def11b2beeaf40749e2464R David Murray    elif len(parts) == 2:
125fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        uri, localname = parts
126fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        prefix = EMPTY_PREFIX
127fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        qname = localname = intern(localname, localname)
1289077d24d7f85e09e53def11b2beeaf40749e2464R David Murray    else:
1299077d24d7f85e09e53def11b2beeaf40749e2464R David Murray        raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
130fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    return intern(uri, uri), localname, prefix, qname
131fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
132fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
133fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwisclass ExpatBuilder:
134fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    """Document builder that uses Expat to build a ParsedXML.DOM document
135fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    instance."""
136fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
137fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def __init__(self, options=None):
138fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if options is None:
139fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            options = xmlbuilder.Options()
140fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._options = options
141fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._options.filter is not None:
142fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._filter = FilterVisibilityController(self._options.filter)
143fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        else:
144fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._filter = None
145fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # This *really* doesn't do anything in this case, so
146fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # override it with something fast & minimal.
147fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._finish_start_element = id
148fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._parser = None
149fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.reset()
150fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
151fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def createParser(self):
152fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Create a new parser object."""
153fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        return expat.ParserCreate()
154fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
155fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def getParser(self):
156fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Return the parser object, creating a new one if needed."""
157fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if not self._parser:
158fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser = self.createParser()
159fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._intern_setdefault = self._parser.intern.setdefault
160fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser.buffer_text = True
161fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser.ordered_attributes = True
162fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser.specified_attributes = True
163fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.install(self._parser)
164fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        return self._parser
165fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
166fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def reset(self):
167fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Free all data structures used during DOM construction."""
168fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.document = theDOMImplementation.createDocument(
169fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            EMPTY_NAMESPACE, None, None)
170fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.curNode = self.document
171fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._elem_info = self.document._elem_info
172fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._cdata = False
173fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
174fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def install(self, parser):
175fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Install the callbacks needed to build the DOM into the parser."""
176fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        # This creates circular references!
177fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
178fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser.StartElementHandler = self.first_element_handler
179fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser.EndElementHandler = self.end_element_handler
180fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser.ProcessingInstructionHandler = self.pi_handler
181fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._options.entities:
182fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser.EntityDeclHandler = self.entity_decl_handler
183fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser.NotationDeclHandler = self.notation_decl_handler
184fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._options.comments:
185fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser.CommentHandler = self.comment_handler
186fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._options.cdata_sections:
187fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser.StartCdataSectionHandler = self.start_cdata_section_handler
188fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser.EndCdataSectionHandler = self.end_cdata_section_handler
189fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser.CharacterDataHandler = self.character_data_handler_cdata
190fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        else:
191fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser.CharacterDataHandler = self.character_data_handler
192fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
193fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser.XmlDeclHandler = self.xml_decl_handler
194fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser.ElementDeclHandler = self.element_decl_handler
195fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser.AttlistDeclHandler = self.attlist_decl_handler
196fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
197fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def parseFile(self, file):
198fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Parse a document from a file object, returning the document
199fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        node."""
200fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser = self.getParser()
201fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        first_buffer = True
202fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        try:
203fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            while 1:
204fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                buffer = file.read(16*1024)
205fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                if not buffer:
206fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    break
207fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                parser.Parse(buffer, 0)
208fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                if first_buffer and self.document.documentElement:
209fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    self._setup_subset(buffer)
210fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                first_buffer = False
211fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser.Parse("", True)
212fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        except ParseEscape:
213fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            pass
214fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        doc = self.document
215fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.reset()
216fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._parser = None
217fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        return doc
218fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
219fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def parseString(self, string):
220fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Parse a document from a string, returning the document node."""
221fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser = self.getParser()
222fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        try:
223fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser.Parse(string, True)
224fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._setup_subset(string)
225fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        except ParseEscape:
226fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            pass
227fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        doc = self.document
228fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.reset()
229fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._parser = None
230fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        return doc
231fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
232fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def _setup_subset(self, buffer):
233fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Load the internal subset if there might be one."""
234fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self.document.doctype:
235fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            extractor = InternalSubsetExtractor()
236fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            extractor.parseString(buffer)
237fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            subset = extractor.getSubset()
238fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.document.doctype.internalSubset = subset
239fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
240fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
241fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                                   has_internal_subset):
242fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        doctype = self.document.implementation.createDocumentType(
243fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            doctypeName, publicId, systemId)
244fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        doctype.ownerDocument = self.document
245297d97241a065fe2a341fe5f340c81566b762142Georg Brandl        _append_child(self.document, doctype)
246fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.document.doctype = doctype
247fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
248fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.document.doctype = None
249fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            del self.document.childNodes[-1]
250fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            doctype = None
251fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser.EntityDeclHandler = None
252fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser.NotationDeclHandler = None
253fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if has_internal_subset:
254fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            if doctype is not None:
255fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                doctype.entities._seq = []
256fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                doctype.notations._seq = []
257fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser.CommentHandler = None
258fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser.ProcessingInstructionHandler = None
259fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
260fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
261fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def end_doctype_decl_handler(self):
262fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._options.comments:
263fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._parser.CommentHandler = self.comment_handler
264fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._parser.ProcessingInstructionHandler = self.pi_handler
265fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if not (self._elem_info or self._filter):
266fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._finish_end_element = id
267fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
268fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def pi_handler(self, target, data):
269fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        node = self.document.createProcessingInstruction(target, data)
270fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        _append_child(self.curNode, node)
271fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
272bc05fc5d2b970a18686c71479f28372a24f97190Neal Norwitz            self.curNode.removeChild(node)
273fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
274fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def character_data_handler_cdata(self, data):
275fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        childNodes = self.curNode.childNodes
276fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._cdata:
277fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            if (  self._cdata_continue
278fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                  and childNodes[-1].nodeType == CDATA_SECTION_NODE):
279fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                childNodes[-1].appendData(data)
280fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                return
281fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            node = self.document.createCDATASection(data)
282fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._cdata_continue = True
283fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
284fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            node = childNodes[-1]
285fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            value = node.data + data
28614aa280de23ecb6c31ffbde4a12c1a0a6f5493e0Martin v. Löwis            node.data = value
287fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            return
288fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        else:
289fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            node = minidom.Text()
29014aa280de23ecb6c31ffbde4a12c1a0a6f5493e0Martin v. Löwis            node.data = data
29114aa280de23ecb6c31ffbde4a12c1a0a6f5493e0Martin v. Löwis            node.ownerDocument = self.document
292fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        _append_child(self.curNode, node)
293fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
294fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def character_data_handler(self, data):
295fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        childNodes = self.curNode.childNodes
296fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
297fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            node = childNodes[-1]
29814aa280de23ecb6c31ffbde4a12c1a0a6f5493e0Martin v. Löwis            node.data = node.data + data
299fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            return
300fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        node = minidom.Text()
30114aa280de23ecb6c31ffbde4a12c1a0a6f5493e0Martin v. Löwis        node.data = node.data + data
30214aa280de23ecb6c31ffbde4a12c1a0a6f5493e0Martin v. Löwis        node.ownerDocument = self.document
303fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        _append_child(self.curNode, node)
304fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
305fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def entity_decl_handler(self, entityName, is_parameter_entity, value,
306fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                            base, systemId, publicId, notationName):
307fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if is_parameter_entity:
308fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # we don't care about parameter entities for the DOM
309fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            return
310fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if not self._options.entities:
311fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            return
312fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        node = self.document._create_entity(entityName, publicId,
313fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                                            systemId, notationName)
314fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if value is not None:
315fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # internal entity
316fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # node *should* be readonly, but we'll cheat
317fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            child = self.document.createTextNode(value)
318fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            node.childNodes.append(child)
319fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.document.doctype.entities._seq.append(node)
320fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
321fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            del self.document.doctype.entities._seq[-1]
322fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
323fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def notation_decl_handler(self, notationName, base, systemId, publicId):
324fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        node = self.document._create_notation(notationName, publicId, systemId)
325fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.document.doctype.notations._seq.append(node)
326fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._filter and self._filter.acceptNode(node) == FILTER_ACCEPT:
327fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            del self.document.doctype.notations._seq[-1]
328fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
329fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def comment_handler(self, data):
330fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        node = self.document.createComment(data)
331fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        _append_child(self.curNode, node)
332fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
333041411a1c70e0e01fb32864359990d4fd3a20f97Martin v. Löwis            self.curNode.removeChild(node)
334fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
335fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def start_cdata_section_handler(self):
336fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._cdata = True
337fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._cdata_continue = False
338fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
339fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def end_cdata_section_handler(self):
340fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._cdata = False
341fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._cdata_continue = False
342fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Handle a reference to an external entity without loading it.

        None of the arguments are examined.
        """
        # A non-zero result tells pyexpat to carry on parsing (see the
        # ExternalEntityRefHandler documentation).
        return 1
345fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
346fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def first_element_handler(self, name, attributes):
347fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._filter is None and not self._elem_info:
348fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._finish_end_element = id
349fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.getParser().StartElementHandler = self.start_element_handler
350fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.start_element_handler(name, attributes)
351fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
352fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def start_element_handler(self, name, attributes):
353fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        node = self.document.createElement(name)
354fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        _append_child(self.curNode, node)
355fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.curNode = node
356fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
357fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if attributes:
358fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            for i in range(0, len(attributes), 2):
359fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
360fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                                 None, EMPTY_PREFIX)
361fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                value = attributes[i+1]
36214aa280de23ecb6c31ffbde4a12c1a0a6f5493e0Martin v. Löwis                a.value = value
36314aa280de23ecb6c31ffbde4a12c1a0a6f5493e0Martin v. Löwis                a.ownerDocument = self.document
364fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                _set_attribute_node(node, a)
365fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
366fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if node is not self.document.documentElement:
367fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._finish_start_element(node)
368fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
369fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def _finish_start_element(self, node):
370fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._filter:
371fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # To be general, we'd have to call isSameNode(), but this
372fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # is sufficient for minidom:
373fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            if node is self.document.documentElement:
374fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                return
375fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            filt = self._filter.startContainer(node)
376fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            if filt == FILTER_REJECT:
377fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                # ignore this node & all descendents
378fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                Rejecter(self)
379fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            elif filt == FILTER_SKIP:
380fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                # ignore this node, but make it's children become
381fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                # children of the parent node
382fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                Skipper(self)
383fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            else:
384fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                return
385fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.curNode = node.parentNode
386fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            node.parentNode.removeChild(node)
387fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            node.unlink()
388fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
389fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    # If this ever changes, Namespaces.end_element_handler() needs to
390fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    # be changed to match.
391fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    #
392fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def end_element_handler(self, name):
393fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        curNode = self.curNode
394fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.curNode = curNode.parentNode
395fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._finish_end_element(curNode)
396fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
397fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def _finish_end_element(self, curNode):
398fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        info = self._elem_info.get(curNode.tagName)
399fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if info:
400fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._handle_white_text_nodes(curNode, info)
401fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if self._filter:
402fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            if curNode is self.document.documentElement:
403fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                return
404fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            if self._filter.acceptNode(curNode) == FILTER_REJECT:
405fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                self.curNode.removeChild(curNode)
406fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                curNode.unlink()
407fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
408fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def _handle_white_text_nodes(self, node, info):
409fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if (self._options.whitespace_in_element_content
410fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            or not info.isElementContent()):
411fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            return
412fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
413fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        # We have element type information and should remove ignorable
414fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        # whitespace; identify for text nodes which contain only
415fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        # whitespace.
416fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        L = []
417fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        for child in node.childNodes:
418fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            if child.nodeType == TEXT_NODE and not child.data.strip():
419fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                L.append(child)
420fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
421fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        # Remove ignorable whitespace from the tree.
422fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        for child in L:
423fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            node.removeChild(child)
424fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
425fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def element_decl_handler(self, name, model):
426fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        info = self._elem_info.get(name)
427fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if info is None:
428fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._elem_info[name] = ElementInfo(name, model)
429fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        else:
430fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            assert info._model is None
431fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            info._model = model
432fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
433fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def attlist_decl_handler(self, elem, name, type, default, required):
434fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        info = self._elem_info.get(elem)
435fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if info is None:
436fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            info = ElementInfo(elem)
437fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self._elem_info[elem] = info
438fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        info._attr_info.append(
439fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            [None, name, None, None, default, 0, type, required])
440fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
441fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def xml_decl_handler(self, version, encoding, standalone):
442fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.document.version = version
443fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.document.encoding = encoding
444fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        # This is still a little ugly, thanks to the pyexpat API. ;-(
445fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if standalone >= 0:
446fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            if standalone:
447fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                self.document.standalone = True
448fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            else:
449fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                self.document.standalone = False
450fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
451fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
# Values a filter may legitimately return from startContainer() and
# acceptNode().  Don't include FILTER_INTERRUPT, since that's checked
# separately where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
455fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class FilterVisibilityController(object):
    """Adapter in front of a DOMBuilderFilter that applies the filter's
    whatToShow mask before the filter itself is consulted."""

    __slots__ = 'filter',

    # The whatToShow bit that selects each DOM node type.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Ask the filter about a node that is being opened.

        Node types hidden by whatToShow are accepted without asking.
        Raises ParseEscape if the filter answers FILTER_INTERRUPT and
        ValueError for any answer that is not an allowed filter result.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not (self.filter.whatToShow & mask):
            return FILTER_ACCEPT
        verdict = self.filter.startContainer(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                "startContainer() returned illegal value: " + repr(verdict))
        return verdict

    def acceptNode(self, node):
        """Ask the filter about a completed node.

        Node types hidden by whatToShow are accepted without asking.  A
        FILTER_SKIP answer re-parents the node's children onto its parent
        and is reported to the caller as FILTER_REJECT.  Raises
        ParseEscape on FILTER_INTERRUPT and ValueError for any answer
        that is not an allowed filter result.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not (self.filter.whatToShow & mask):
            return FILTER_ACCEPT
        verdict = self.filter.acceptNode(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict == FILTER_SKIP:
            # Hoist every child up to the parent; iterate over a copy
            # because appendChild() edits node.childNodes as it goes.
            new_parent = node.parentNode
            for orphan in list(node.childNodes):
                new_parent.appendChild(orphan)
            # Removing the node itself is left to the caller.
            return FILTER_REJECT
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                "acceptNode() returned illegal value: " + repr(verdict))
        return verdict
512fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
513fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class FilterCrutch(object):
    """Base for objects that temporarily take over a builder's element
    handlers.

    On construction the parser's current start/end element handlers are
    remembered and replaced by this object's own start_element_handler
    and end_element_handler, which subclasses are expected to provide.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        parser = builder._parser
        self._builder = builder
        self._level = 0
        # Keep the handlers being displaced so they can be put back.
        self._old_start, self._old_end = (
            parser.StartElementHandler, parser.EndElementHandler)
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler
525fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class Rejecter(FilterCrutch):
    """Crutch that swallows a rejected element and everything inside it.

    While active, the content handlers listed in __init__ are switched
    off and nested elements are merely counted.  When the end tag of the
    rejected element arrives, the builder re-installs its handlers and
    the displaced start/end element handlers are put back.
    """

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        parser = builder._parser
        for handler_name in ("ProcessingInstructionHandler",
                             "CommentHandler",
                             "CharacterDataHandler",
                             "StartCdataSectionHandler",
                             "EndCdataSectionHandler",
                             "ExternalEntityRefHandler",
                             ):
            setattr(parser, handler_name, None)

    def start_element_handler(self, *args):
        self._level += 1

    def end_element_handler(self, *args):
        if self._level:
            # Still inside a nested element of the rejected subtree.
            self._level -= 1
            return
        # restore the old handlers
        parser = self._builder._parser
        self._builder.install(parser)
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
553fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class Skipper(FilterCrutch):
    """Crutch that drops a skipped element but keeps building its content.

    Start and end tags inside the skipped element are forwarded to the
    displaced handlers; nesting depth is tracked so that the skipped
    element's own end tag can be recognised, at which point the displaced
    handlers are simply put back.
    """

    __slots__ = ()

    def start_element_handler(self, *args):
        before = self._builder.curNode
        self._old_start(*args)
        # Only count the element if the builder really descended into it.
        if self._builder.curNode is not before:
            self._level += 1

    def end_element_handler(self, *args):
        if self._level:
            self._level -= 1
            self._old_end(*args)
            return
        # We're popping back out of the node we're skipping, so we
        # shouldn't need to do anything but reset the handlers.
        parser = self._builder._parser
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
        self._builder = None
573fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
574fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
# Framework document used by the fragment builder.  Once the internal
# system identifier below has been substituted in, three "%s" slots
# remain; they take, in order, a string for the doctype identifier, the
# internal subset string, and the namespace attributes string.

_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
  %%s [
  <!ENTITY fragment-builder-internal
    SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
592fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
593fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
594fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwisclass FragmentBuilder(ExpatBuilder):
595fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    """Builder which constructs document fragments given XML source
596fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    text and a context node.
597fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
598fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    The context node is expected to provide information about the
599fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    namespace declarations which are in scope at the start of the
600fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    fragment.
601fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    """
602fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
603fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def __init__(self, context, options=None):
604fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if context.nodeType == DOCUMENT_NODE:
605fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.originalDocument = context
606fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.context = context
607fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        else:
608fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.originalDocument = context.ownerDocument
609fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.context = context
610fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        ExpatBuilder.__init__(self, options)
611fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
    def reset(self):
        """Reset the base builder state and forget any fragment built so far."""
        ExpatBuilder.reset(self)
        self.fragment = None
615fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
616fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def parseFile(self, file):
617fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Parse a document fragment from a file object, returning the
618fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        fragment node."""
619fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        return self.parseString(file.read())
620fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
621fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def parseString(self, string):
622fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Parse a document fragment from a string, returning the
623fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        fragment node."""
624fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self._source = string
625fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        parser = self.getParser()
626fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        doctype = self.originalDocument.doctype
627fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        ident = ""
628fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if doctype:
629fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            subset = doctype.internalSubset or self._getDeclarations()
630fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            if doctype.publicId:
631fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                ident = ('PUBLIC "%s" "%s"'
632fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                         % (doctype.publicId, doctype.systemId))
633fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            elif doctype.systemId:
634fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                ident = 'SYSTEM "%s"' % doctype.systemId
635fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        else:
636fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            subset = ""
637fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        nsattrs = self._getNSattrs() # get ns decls from node's ancestors
638fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
639fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        try:
640fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser.Parse(document, 1)
641fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        except:
642fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.reset()
643fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            raise
644fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        fragment = self.fragment
645fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        self.reset()
646fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis##         self._parser = None
647fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        return fragment
648fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
649fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def _getDeclarations(self):
650fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """Re-create the internal subset from the DocumentType node.
651fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
652fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        This is only needed if we don't already have the
653fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        internalSubset as a string.
654fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        """
655fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        doctype = self.context.ownerDocument.doctype
656fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        s = ""
657fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if doctype:
658fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            for i in range(doctype.notations.length):
659fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                notation = doctype.notations.item(i)
660fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                if s:
661fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    s = s + "\n  "
662fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                s = "%s<!NOTATION %s" % (s, notation.nodeName)
663fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                if notation.publicId:
664fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    s = '%s PUBLIC "%s"\n             "%s">' \
665fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                        % (s, notation.publicId, notation.systemId)
666fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                else:
667fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    s = '%s SYSTEM "%s">' % (s, notation.systemId)
668fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            for i in range(doctype.entities.length):
669fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                entity = doctype.entities.item(i)
670fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                if s:
671fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    s = s + "\n  "
672fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                s = "%s<!ENTITY %s" % (s, entity.nodeName)
673fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                if entity.publicId:
674fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    s = '%s PUBLIC "%s"\n             "%s"' \
675fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                        % (s, entity.publicId, entity.systemId)
676fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                elif entity.systemId:
677fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    s = '%s SYSTEM "%s"' % (s, entity.systemId)
678fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                else:
679fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    s = '%s "%s"' % (s, entity.firstChild.data)
680fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                if entity.notationName:
681fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                    s = "%s NOTATION %s" % (s, entity.notationName)
682fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                s = s + ">"
683fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        return s
684fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
685fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def _getNSattrs(self):
686fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        return ""
687fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
688fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis    def external_entity_ref_handler(self, context, base, systemId, publicId):
689fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
690fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # this entref is the one that we made to put the subtree
691fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # in; all of our given input is parsed in here.
692fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            old_document = self.document
693fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            old_cur_node = self.curNode
694fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            parser = self._parser.ExternalEntityParserCreate(context)
695fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            # put the real document back, parse into the fragment to return
696fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.document = self.originalDocument
697fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.fragment = self.document.createDocumentFragment()
698fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            self.curNode = self.fragment
699fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            try:
700fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                parser.Parse(self._source, 1)
701fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            finally:
702fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                self.curNode = old_cur_node
703fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                self.document = old_document
704fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                self._source = None
705fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            return -1
706fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis        else:
707fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis            return ExpatBuilder.external_entity_ref_handler(
708fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis                self, context, base, systemId, publicId)
709fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
710fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class Namespaces:
    """Mix-in for builder classes that adds namespace processing."""

    def _initNamespaces(self):
        # Pending (prefix, uri) declarations.  They are turned into
        # xmlns attributes on the next element that is started.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Return a new expat parser set up for namespace handling."""
        ns_parser = expat.ParserCreate(namespace_separator=" ")
        ns_parser.namespace_prefixes = True
        return ns_parser

    def install(self, parser):
        """Install the regular handlers plus the namespace one on parser.

        The namespace-declaration handler is only registered when the
        namespace_declarations option is enabled.
        """
        ExpatBuilder.install(self, parser)
        if not self._options.namespace_declarations:
            return
        parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler

    def start_namespace_decl_handler(self, prefix, uri):
        """Record a namespace declaration until its element is started."""
        self._ns_ordered_prefixes.append((prefix, uri))

    def start_element_handler(self, name, attributes):
        """Create the element for a start tag and make it the current node.

        'name' is either a plain name or a space-separated namespace
        name; 'attributes' is a flat sequence of alternating names and
        values.
        """
        if ' ' in name:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        else:
            uri, localname, prefix, qname = (
                EMPTY_NAMESPACE, None, EMPTY_PREFIX, name)
        element = minidom.Element(qname, uri, prefix, localname)
        element.ownerDocument = self.document
        _append_child(self.curNode, element)
        self.curNode = element

        # Turn the declarations collected since the previous element
        # into xmlns attributes, then empty the pending list in place.
        pending = self._ns_ordered_prefixes
        if pending:
            for ns_prefix, ns_uri in pending:
                if ns_prefix:
                    decl = minidom.Attr(_intern(self, 'xmlns:' + ns_prefix),
                                        XMLNS_NAMESPACE, ns_prefix, "xmlns")
                else:
                    decl = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                        "xmlns", EMPTY_PREFIX)
                decl.value = ns_uri
                decl.ownerDocument = self.document
                _set_attribute_node(element, decl)
            del pending[:]

        if not attributes:
            return

        element._ensure_attributes()
        by_qname = element._attrs
        by_ns_name = element._attrsNS
        for idx in range(0, len(attributes), 2):
            raw_name = attributes[idx]
            value = attributes[idx + 1]
            if ' ' in raw_name:
                a_uri, a_local, a_prefix, a_qname = _parse_ns_name(
                    self, raw_name)
                attr = minidom.Attr(a_qname, a_uri, a_local, a_prefix)
                by_qname[a_qname] = attr
                by_ns_name[(a_uri, a_local)] = attr
            else:
                attr = minidom.Attr(raw_name, EMPTY_NAMESPACE,
                                    raw_name, EMPTY_PREFIX)
                by_qname[raw_name] = attr
                by_ns_name[(EMPTY_NAMESPACE, raw_name)] = attr
            # Keep this assignment order: value is set before ownerElement.
            attr.ownerDocument = self.document
            attr.value = value
            attr.ownerElement = element

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            finished = self.curNode
            if ' ' not in name:
                assert finished.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert finished.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            else:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (finished.namespaceURI == uri
                        and finished.localName == localname
                        and finished.prefix == prefix), \
                        "element stack messed up! (namespace)"
            self.curNode = finished.parentNode
            self._finish_end_element(finished)
804fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
805fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Namespace-aware document builder."""

    def reset(self):
        """Reset the builder state, then start with no pending
        namespace declarations."""
        ExpatBuilder.reset(self)
        self._initNamespaces()
812fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
813fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        """Reset the fragment-builder state, then start with no pending
        namespace declarations."""
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors.

        Walks from self.context up through parentNode and collects the
        entries of every '_ns_prefix_uri' mapping found; a prefix
        declared on a nearer node hides the same prefix further up.  The
        result is either empty or starts with a space, with later
        declarations on their own indented lines.  URIs are escaped so
        they are safe inside the single-quoted attribute values.
        """
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        seen = set()
        decls = []
        context = self.context
        while context:
            ns_map = getattr(context, '_ns_prefix_uri', None)
            if ns_map:
                for prefix, uri in ns_map.items():
                    # only the innermost declaration of a prefix counts
                    if prefix in seen:
                        continue
                    seen.add(prefix)
                    declname = "xmlns:" + prefix if prefix else "xmlns"
                    # '&' must be replaced first so the entities that
                    # are inserted afterwards are not escaped again.
                    value = (("%s" % (uri,))
                             .replace("&", "&amp;")
                             .replace("<", "&lt;")
                             .replace("'", "&apos;"))
                    decls.append("%s='%s'" % (declname, value))
            context = context.parentNode
        if not decls:
            return ""
        return " " + "\n    ".join(decls)
849fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
850fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class ParseEscape(Exception):
    """Raised from a parser callback to abandon parsing early.

    InternalSubsetExtractor raises it once there is nothing more to
    collect and catches it again around the parse call.
    """
854fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
class InternalSubsetExtractor(ExpatBuilder):
    """XML processor that pulls out the internal document type subset."""

    # None until a doctype with an internal subset is seen, a list of
    # text chunks while the subset is being read, and the finished
    # string once the doctype declaration has ended.
    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        """Parse 'file', treating the early-exit exception as normal."""
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            # raised by our own handlers to stop parsing early
            pass

    def parseString(self, string):
        """Parse 'string', treating the early-exit exception as normal."""
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            # raised by our own handlers to stop parsing early
            pass

    def install(self, parser):
        # Only these two handlers are installed here;
        # ExpatBuilder.install() is not called.
        parser.StartElementHandler = self.start_element_handler
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        if not has_internal_subset:
            # nothing to extract
            raise ParseEscape()
        parser = self.getParser()
        chunks = self.subset = []
        # Text passed to the default handler from here on is collected
        # until the doctype declaration ends.
        parser.DefaultHandler = chunks.append
        parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        collected = ''.join(self.subset)
        # normalise line endings to '\n'
        self.subset = collected.replace('\r\n', '\n').replace('\r', '\n')
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        # Reaching an element means there is nothing more to collect.
        raise ParseEscape()
897fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
898fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
def parse(file, namespaces=True):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name (a string or an os.PathLike
    object) or an open file object.  A file name is opened in binary
    mode and closed again when parsing is done.  A namespace-aware
    builder is used when 'namespaces' is true.
    """
    import os  # local import: only needed to recognise path-like objects

    builder = ExpatBuilderNS() if namespaces else ExpatBuilder()

    # An object that already offers read() is still treated as an open
    # file, even if it is path-like too.
    is_file_name = isinstance(file, str) or (
        isinstance(file, os.PathLike) and not hasattr(file, "read"))
    if is_file_name:
        with open(file, 'rb') as fp:
            return builder.parseFile(fp)
    return builder.parseFile(file)
915fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
916fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
def parseString(string, namespaces=True):
    """Build a Document node from the XML text in 'string'.

    A namespace-aware builder is used when 'namespaces' is true.
    """
    builder_class = ExpatBuilderNS if namespaces else ExpatBuilder
    return builder_class().parseString(string)
926fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
927fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name (a string or an os.PathLike
    object) or an open file object.  A file name is opened in binary
    mode and closed again when parsing is done.  A namespace-aware
    builder is used when 'namespaces' is true.
    """
    import os  # local import: only needed to recognise path-like objects

    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)

    # An object that already offers read() is still treated as an open
    # file, even if it is path-like too.
    is_file_name = isinstance(file, str) or (
        isinstance(file, os.PathLike) and not hasattr(file, "read"))
    if is_file_name:
        with open(file, 'rb') as fp:
            return builder.parseFile(fp)
    return builder.parseFile(file)
946fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
947fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
def parseFragmentString(string, context, namespaces=True):
    """Parse a document fragment held in 'string'.

    'context' is the node the fragment was originally extracted from,
    i.e. the parent of the node(s) in the fragment.  A namespace-aware
    fragment builder is used when 'namespaces' is true.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    return builder_class(context).parseString(string)
958fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
959fc5fec77350c1231f55ac8facb1ec3ce0d635a4dMartin v. Löwis
def makeBuilder(options):
    """Return a new builder configured by the Options object 'options'.

    The namespace-aware builder is chosen when options.namespaces is
    true, the plain one otherwise.
    """
    builder_class = ExpatBuilderNS if options.namespaces else ExpatBuilder
    return builder_class(options)
966