1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package org.apache.harmony.xml.parsers;
18
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import javax.xml.parsers.DocumentBuilder;
import libcore.io.IoUtils;
import org.apache.harmony.xml.dom.CDATASectionImpl;
import org.apache.harmony.xml.dom.DOMImplementationImpl;
import org.apache.harmony.xml.dom.DocumentImpl;
import org.apache.harmony.xml.dom.DocumentTypeImpl;
import org.apache.harmony.xml.dom.TextImpl;
import org.kxml2.io.KXmlParser;
import org.w3c.dom.Attr;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.LocatorImpl;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
45
46/**
47 * Builds a DOM using KXmlParser.
48 */
49class DocumentBuilderImpl extends DocumentBuilder {
50
51    private static DOMImplementationImpl dom = DOMImplementationImpl.getInstance();
52
53    private boolean coalescing;
54    private EntityResolver entityResolver;
55    private ErrorHandler errorHandler;
56    private boolean ignoreComments;
57    private boolean ignoreElementContentWhitespace;
58    private boolean namespaceAware;
59    // adding a new field? don't forget to update reset().
60
61    @Override public void reset() {
62        coalescing = false;
63        entityResolver = null;
64        errorHandler = null;
65        ignoreComments = false;
66        ignoreElementContentWhitespace = false;
67        namespaceAware = false;
68    }
69
70    @Override
71    public DOMImplementation getDOMImplementation() {
72        return dom;
73    }
74
75    @Override
76    public boolean isNamespaceAware() {
77        return namespaceAware;
78    }
79
80    @Override
81    public boolean isValidating() {
82        return false;
83    }
84
85    @Override
86    public Document newDocument() {
87        return dom.createDocument(null, null, null);
88    }
89
90    @Override
91    public Document parse(InputSource source) throws SAXException, IOException {
92        if (source == null) {
93            throw new IllegalArgumentException("source == null");
94        }
95
96        String namespaceURI = null;
97        String qualifiedName = null;
98        DocumentType doctype = null;
99        String inputEncoding = source.getEncoding();
100        String systemId = source.getSystemId();
101        DocumentImpl document = new DocumentImpl(
102                dom, namespaceURI, qualifiedName, doctype, inputEncoding);
103        document.setDocumentURI(systemId);
104
105        KXmlParser parser = new KXmlParser();
106        try {
107            parser.keepNamespaceAttributes();
108            parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, namespaceAware);
109
110            if (source.getByteStream() != null) {
111                parser.setInput(source.getByteStream(), inputEncoding);
112            } else if (source.getCharacterStream() != null) {
113                parser.setInput(source.getCharacterStream());
114            } else if (systemId != null) {
115                URL url = new URL(systemId);
116                URLConnection urlConnection = url.openConnection();
117                urlConnection.connect();
118                // TODO: if null, extract the inputEncoding from the Content-Type header?
119                parser.setInput(urlConnection.getInputStream(), inputEncoding);
120            } else {
121                throw new SAXParseException("InputSource needs a stream, reader or URI", null);
122            }
123
124            if (parser.nextToken() == XmlPullParser.END_DOCUMENT) {
125                throw new SAXParseException("Unexpected end of document", null);
126            }
127
128            parse(parser, document, document, XmlPullParser.END_DOCUMENT);
129
130            parser.require(XmlPullParser.END_DOCUMENT, null, null);
131        } catch (XmlPullParserException ex) {
132            if (ex.getDetail() instanceof IOException) {
133                throw (IOException) ex.getDetail();
134            }
135            if (ex.getDetail() instanceof RuntimeException) {
136                throw (RuntimeException) ex.getDetail();
137            }
138
139            LocatorImpl locator = new LocatorImpl();
140
141            locator.setPublicId(source.getPublicId());
142            locator.setSystemId(systemId);
143            locator.setLineNumber(ex.getLineNumber());
144            locator.setColumnNumber(ex.getColumnNumber());
145
146            SAXParseException newEx = new SAXParseException(ex.getMessage(), locator);
147
148            if (errorHandler != null) {
149                errorHandler.error(newEx);
150            }
151
152            throw newEx;
153        } finally {
154            IoUtils.closeQuietly(parser);
155        }
156
157        return document;
158    }
159
160    /**
161     * Implements the whole parsing of the XML document. The XML pull parser is
162     * actually more of a tokenizer, and we are doing a classical recursive
163     * descent parsing (the method invokes itself for XML elements). Our
164     * approach to parsing does accept some illegal documents (more than one
165     * root element, for example). The assumption is that the DOM implementation
166     * throws the proper exceptions in these cases.
167     *
168     * @param parser The XML pull parser we're reading from.
169     * @param document The document we're building.
170     * @param node The node we're currently on (initially the document itself).
171     * @param endToken The token that will end this recursive call. Either
172     *        XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG.
173     *
174     * @throws XmlPullParserException If a parsing error occurs.
175     * @throws IOException If a general IO error occurs.
176     */
177    private void parse(KXmlParser parser, DocumentImpl document, Node node,
178            int endToken) throws XmlPullParserException, IOException {
179
180        int token = parser.getEventType();
181
182        /*
183         * The main parsing loop. The precondition is that we are already on the
184         * token to be processed. This holds for each iteration of the loop, so
185         * the inner statements have to ensure that (in particular the recursive
186         * call).
187         */
188        while (token != endToken && token != XmlPullParser.END_DOCUMENT) {
189            if (token == XmlPullParser.PROCESSING_INSTRUCTION) {
190                /*
191                 * Found a processing instructions. We need to split the token
192                 * text at the first whitespace character.
193                 */
194                String text = parser.getText();
195
196                int dot = text.indexOf(' ');
197
198                String target = (dot != -1 ? text.substring(0, dot) : text);
199                String data = (dot != -1 ? text.substring(dot + 1) : "");
200
201                node.appendChild(document.createProcessingInstruction(target,
202                        data));
203            } else if (token == XmlPullParser.DOCDECL) {
204                String name = parser.getRootElementName();
205                String publicId = parser.getPublicId();
206                String systemId = parser.getSystemId();
207                document.appendChild(new DocumentTypeImpl(document, name, publicId, systemId));
208
209            } else if (token == XmlPullParser.COMMENT) {
210                /*
211                 * Found a comment. We simply take the token text, but we only
212                 * create a node if the client wants to see comments at all.
213                 */
214                if (!ignoreComments) {
215                    node.appendChild(document.createComment(parser.getText()));
216                }
217            } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) {
218                /*
219                 * Found some ignorable whitespace. We only add it if the client
220                 * wants to see whitespace. Whitespace before and after the
221                 * document element is always ignored.
222                 */
223                if (!ignoreElementContentWhitespace && document != node) {
224                    appendText(document, node, token, parser.getText());
225                }
226            } else if (token == XmlPullParser.TEXT || token == XmlPullParser.CDSECT) {
227                /*
228                 * Found a piece of text (possibly encoded as a CDATA section).
229                 * That's the easiest case. We simply take it and create a new text node,
230                 * or merge with an adjacent text node.
231                 */
232                appendText(document, node, token, parser.getText());
233            } else if (token == XmlPullParser.ENTITY_REF) {
234                /*
235                 * Found an entity reference. If an entity resolver is
236                 * installed, we replace it by text (if possible). Otherwise we
237                 * add an entity reference node.
238                 */
239                String entity = parser.getName();
240
241                if (entityResolver != null) {
242                    // TODO Implement this...
243                }
244
245                String resolved = resolvePredefinedOrCharacterEntity(entity);
246                if (resolved != null) {
247                    appendText(document, node, token, resolved);
248                } else {
249                    node.appendChild(document.createEntityReference(entity));
250                }
251            } else if (token == XmlPullParser.START_TAG) {
252                /*
253                 * Found an element start tag. We create an element node with
254                 * the proper info and attributes. We then invoke parse()
255                 * recursively to handle the next level of nesting. When we
256                 * return from this call, we check that we are on the proper
257                 * element end tag. The whole handling differs somewhat
258                 * depending on whether the parser is namespace-aware or not.
259                 */
260                if (namespaceAware) {
261                    // Collect info for element node
262                    String namespace = parser.getNamespace();
263                    String name = parser.getName();
264                    String prefix = parser.getPrefix();
265
266                    if ("".equals(namespace)) {
267                        namespace = null;
268                    }
269
270                    // Create element node and wire it correctly
271                    Element element = document.createElementNS(namespace, name);
272                    element.setPrefix(prefix);
273                    node.appendChild(element);
274
275                    for (int i = 0; i < parser.getAttributeCount(); i++) {
276                        // Collect info for a single attribute node
277                        String attrNamespace = parser.getAttributeNamespace(i);
278                        String attrPrefix = parser.getAttributePrefix(i);
279                        String attrName = parser.getAttributeName(i);
280                        String attrValue = parser.getAttributeValue(i);
281
282                        if ("".equals(attrNamespace)) {
283                            attrNamespace = null;
284                        }
285
286                        // Create attribute node and wire it correctly
287                        Attr attr = document.createAttributeNS(attrNamespace, attrName);
288                        attr.setPrefix(attrPrefix);
289                        attr.setValue(attrValue);
290                        element.setAttributeNodeNS(attr);
291                    }
292
293                    // Recursive descent
294                    token = parser.nextToken();
295                    parse(parser, document, element, XmlPullParser.END_TAG);
296
297                    // Expect the element's end tag here
298                    parser.require(XmlPullParser.END_TAG, namespace, name);
299
300                } else {
301                    // Collect info for element node
302                    String name = parser.getName();
303
304                    // Create element node and wire it correctly
305                    Element element = document.createElement(name);
306                    node.appendChild(element);
307
308                    for (int i = 0; i < parser.getAttributeCount(); i++) {
309                        // Collect info for a single attribute node
310                        String attrName = parser.getAttributeName(i);
311                        String attrValue = parser.getAttributeValue(i);
312
313                        // Create attribute node and wire it correctly
314                        Attr attr = document.createAttribute(attrName);
315                        attr.setValue(attrValue);
316                        element.setAttributeNode(attr);
317                    }
318
319                    // Recursive descent
320                    token = parser.nextToken();
321                    parse(parser, document, element, XmlPullParser.END_TAG);
322
323                    // Expect the element's end tag here
324                    parser.require(XmlPullParser.END_TAG, "", name);
325                }
326            }
327
328            token = parser.nextToken();
329        }
330    }
331
332    /**
333     * @param token the XML pull parser token type, such as XmlPullParser.CDSECT
334     *      or XmlPullParser.ENTITY_REF.
335     */
336    private void appendText(DocumentImpl document, Node parent, int token, String text) {
337        // Ignore empty runs.
338        if (text.isEmpty()) {
339            return;
340        }
341        // Merge with any previous text node if possible.
342        if (coalescing || token != XmlPullParser.CDSECT) {
343            Node lastChild = parent.getLastChild();
344            if (lastChild != null && lastChild.getNodeType() == Node.TEXT_NODE) {
345                Text textNode = (Text) lastChild;
346                textNode.appendData(text);
347                return;
348            }
349        }
350        // Okay, we really do need a new text node
351        parent.appendChild(token == XmlPullParser.CDSECT
352                ? new CDATASectionImpl(document, text)
353                : new TextImpl(document, text));
354    }
355
356    @Override
357    public void setEntityResolver(EntityResolver resolver) {
358        entityResolver = resolver;
359    }
360
361    @Override
362    public void setErrorHandler(ErrorHandler handler) {
363        errorHandler = handler;
364    }
365
366    /**
367     * Controls whether this DocumentBuilder ignores comments.
368     */
369    public void setIgnoreComments(boolean value) {
370        ignoreComments = value;
371    }
372
373    public void setCoalescing(boolean value) {
374        coalescing = value;
375    }
376
377    /**
378     * Controls whether this DocumentBuilder ignores element content whitespace.
379     */
380    public void setIgnoreElementContentWhitespace(boolean value) {
381        ignoreElementContentWhitespace = value;
382    }
383
384    /**
385     * Controls whether this DocumentBuilder is namespace-aware.
386     */
387    public void setNamespaceAware(boolean value) {
388        namespaceAware = value;
389    }
390
391    /**
392     * Returns the replacement text or null if {@code entity} isn't predefined.
393     */
394    private String resolvePredefinedOrCharacterEntity(String entityName) {
395        // Character references, section 4.1 of the XML specification.
396        if (entityName.startsWith("#x")) {
397            return resolveCharacterReference(entityName.substring(2), 16);
398        } else if (entityName.startsWith("#")) {
399            return resolveCharacterReference(entityName.substring(1), 10);
400        }
401        // Predefined entities, section 4.6 of the XML specification.
402        if ("lt".equals(entityName)) {
403            return "<";
404        } else if ("gt".equals(entityName)) {
405            return ">";
406        } else if ("amp".equals(entityName)) {
407            return "&";
408        } else if ("apos".equals(entityName)) {
409            return "'";
410        } else if ("quot".equals(entityName)) {
411            return "\"";
412        } else {
413            return null;
414        }
415    }
416
417    private String resolveCharacterReference(String value, int base) {
418        try {
419            int codePoint = Integer.parseInt(value, base);
420            if (Character.isBmpCodePoint(codePoint)) {
421                return String.valueOf((char) codePoint);
422            } else {
423                char[] surrogatePair = Character.toChars(codePoint);
424                return new String(surrogatePair);
425            }
426        } catch (NumberFormatException ex) {
427            return null;
428        }
429    }
430}
431