xml/parsers/DocumentBuilderImpl.java

/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.harmony.xml.parsers;

import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import javax.xml.parsers.DocumentBuilder;
import libcore.io.IoUtils;
import org.apache.harmony.xml.dom.CDATASectionImpl;
import org.apache.harmony.xml.dom.DOMImplementationImpl;
import org.apache.harmony.xml.dom.DocumentImpl;
import org.apache.harmony.xml.dom.DocumentTypeImpl;
import org.apache.harmony.xml.dom.TextImpl;
import org.kxml2.io.KXmlParser;
import org.w3c.dom.Attr;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.LocatorImpl;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;

/**
 * Builds a DOM using KXmlParser.
 */
class DocumentBuilderImpl extends DocumentBuilder {

    private static DOMImplementationImpl dom = DOMImplementationImpl.getInstance();

    private boolean coalescing;
    private EntityResolver entityResolver;
    private ErrorHandler errorHandler;
    private boolean ignoreComments;
    private boolean ignoreElementContentWhitespace;
    private boolean namespaceAware;
    // adding a new field? don't forget to update reset().

    @Override public void reset() {
        coalescing = false;
        entityResolver = null;
        errorHandler = null;
        ignoreComments = false;
        ignoreElementContentWhitespace = false;
        namespaceAware = false;
    }

    @Override
    public DOMImplementation getDOMImplementation() {
        return dom;
    }

    @Override
    public boolean isNamespaceAware() {
        return namespaceAware;
    }

    @Override
    public boolean isValidating() {
        return false;
    }

    @Override
    public Document newDocument() {
        return dom.createDocument(null, null, null);
    }

    @Override
    public Document parse(InputSource source) throws SAXException, IOException {
        if (source == null) {
            throw new IllegalArgumentException("source == null");
        }

        String namespaceURI = null;
        String qualifiedName = null;
        DocumentType doctype = null;
        String inputEncoding = source.getEncoding();
        String systemId = source.getSystemId();
        DocumentImpl document = new DocumentImpl(
                dom, namespaceURI, qualifiedName, doctype, inputEncoding);
        document.setDocumentURI(systemId);

        KXmlParser parser = new KXmlParser();
        try {
            parser.keepNamespaceAttributes();
            parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, namespaceAware);

            if (source.getByteStream() != null) {
                parser.setInput(source.getByteStream(), inputEncoding);
            } else if (source.getCharacterStream() != null) {
                parser.setInput(source.getCharacterStream());
            } else if (systemId != null) {
                URL url = new URL(systemId);
                URLConnection urlConnection = url.openConnection();
                urlConnection.connect();
                // TODO: if null, extract the inputEncoding from the Content-Type header?
                parser.setInput(urlConnection.getInputStream(), inputEncoding);
            } else {
                throw new SAXParseException("InputSource needs a stream, reader or URI", null);
            }

            if (parser.nextToken() == XmlPullParser.END_DOCUMENT) {
                throw new SAXParseException("Unexpected end of document", null);
            }

            parse(parser, document, document, XmlPullParser.END_DOCUMENT);

            parser.require(XmlPullParser.END_DOCUMENT, null, null);
        } catch (XmlPullParserException ex) {
            if (ex.getDetail() instanceof IOException) {
                throw (IOException) ex.getDetail();
            }
            if (ex.getDetail() instanceof RuntimeException) {
                throw (RuntimeException) ex.getDetail();
            }

            LocatorImpl locator = new LocatorImpl();

            locator.setPublicId(source.getPublicId());
            locator.setSystemId(systemId);
            locator.setLineNumber(ex.getLineNumber());
            locator.setColumnNumber(ex.getColumnNumber());

            SAXParseException newEx = new SAXParseException(ex.getMessage(), locator);

            if (errorHandler != null) {
                errorHandler.error(newEx);
            }

            throw newEx;
        } finally {
            IoUtils.closeQuietly(parser);
        }

        return document;
    }

    /**
     * Implements the whole parsing of the XML document. The XML pull parser is
     * actually more of a tokenizer, and we are doing a classical recursive
     * descent parsing (the method invokes itself for XML elements). Our
     * approach to parsing does accept some illegal documents (more than one
     * root element, for example). The assumption is that the DOM implementation
     * throws the proper exceptions in these cases.
     *
     * @param parser The XML pull parser we're reading from.
     * @param document The document we're building.
     * @param node The node we're currently on (initially the document itself).
     * @param endToken The token that will end this recursive call. Either
     *        XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG.
     *
     * @throws XmlPullParserException If a parsing error occurs.
     * @throws IOException If a general IO error occurs.
     */
    private void parse(KXmlParser parser, DocumentImpl document, Node node,
            int endToken) throws XmlPullParserException, IOException {

        int token = parser.getEventType();

        /*
         * The main parsing loop. The precondition is that we are already on the
         * token to be processed. This holds for each iteration of the loop, so
         * the inner statements have to ensure that (in particular the recursive
         * call).
         */
        while (token != endToken && token != XmlPullParser.END_DOCUMENT) {
            if (token == XmlPullParser.PROCESSING_INSTRUCTION) {
                /*
                 * Found a processing instructions. We need to split the token
                 * text at the first whitespace character.
                 */
                String text = parser.getText();

                int dot = text.indexOf(' ');

                String target = (dot != -1 ? text.substring(0, dot) : text);
                String data = (dot != -1 ? text.substring(dot + 1) : "");

                node.appendChild(document.createProcessingInstruction(target,
                        data));
            } else if (token == XmlPullParser.DOCDECL) {
                String name = parser.getRootElementName();
                String publicId = parser.getPublicId();
                String systemId = parser.getSystemId();
                document.appendChild(new DocumentTypeImpl(document, name, publicId, systemId));

            } else if (token == XmlPullParser.COMMENT) {
                /*
                 * Found a comment. We simply take the token text, but we only
                 * create a node if the client wants to see comments at all.
                 */
                if (!ignoreComments) {
                    node.appendChild(document.createComment(parser.getText()));
                }
            } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) {
                /*
                 * Found some ignorable whitespace. We only add it if the client
                 * wants to see whitespace. Whitespace before and after the
                 * document element is always ignored.
                 */
                if (!ignoreElementContentWhitespace && document != node) {
                    appendText(document, node, token, parser.getText());
                }
            } else if (token == XmlPullParser.TEXT || token == XmlPullParser.CDSECT) {
                /*
                 * Found a piece of text (possibly encoded as a CDATA section).
                 * That's the easiest case. We simply take it and create a new text node,
                 * or merge with an adjacent text node.
                 */
                appendText(document, node, token, parser.getText());
            } else if (token == XmlPullParser.ENTITY_REF) {
                /*
                 * Found an entity reference. If an entity resolver is
                 * installed, we replace it by text (if possible). Otherwise we
                 * add an entity reference node.
                 */
                String entity = parser.getName();

                if (entityResolver != null) {
                    // TODO Implement this...
                }

                String resolved = resolvePredefinedOrCharacterEntity(entity);
                if (resolved != null) {
                    appendText(document, node, token, resolved);
                } else {
                    node.appendChild(document.createEntityReference(entity));
                }
            } else if (token == XmlPullParser.START_TAG) {
                /*
                 * Found an element start tag. We create an element node with
                 * the proper info and attributes. We then invoke parse()
                 * recursively to handle the next level of nesting. When we
                 * return from this call, we check that we are on the proper
                 * element end tag. The whole handling differs somewhat
                 * depending on whether the parser is namespace-aware or not.
                 */
                if (namespaceAware) {
                    // Collect info for element node
                    String namespace = parser.getNamespace();
                    String name = parser.getName();
                    String prefix = parser.getPrefix();

                    if ("".equals(namespace)) {
                        namespace = null;
                    }

                    // Create element node and wire it correctly
                    Element element = document.createElementNS(namespace, name);
                    element.setPrefix(prefix);
                    node.appendChild(element);

                    for (int i = 0; i < parser.getAttributeCount(); i++) {
                        // Collect info for a single attribute node
                        String attrNamespace = parser.getAttributeNamespace(i);
                        String attrPrefix = parser.getAttributePrefix(i);
                        String attrName = parser.getAttributeName(i);
                        String attrValue = parser.getAttributeValue(i);

                        if ("".equals(attrNamespace)) {
                            attrNamespace = null;
                        }

                        // Create attribute node and wire it correctly
                        Attr attr = document.createAttributeNS(attrNamespace, attrName);
                        attr.setPrefix(attrPrefix);
                        attr.setValue(attrValue);
                        element.setAttributeNodeNS(attr);
                    }

                    // Recursive descent
                    token = parser.nextToken();
                    parse(parser, document, element, XmlPullParser.END_TAG);

                    // Expect the element's end tag here
                    parser.require(XmlPullParser.END_TAG, namespace, name);

                } else {
                    // Collect info for element node
                    String name = parser.getName();

                    // Create element node and wire it correctly
                    Element element = document.createElement(name);
                    node.appendChild(element);

                    for (int i = 0; i < parser.getAttributeCount(); i++) {
                        // Collect info for a single attribute node
                        String attrName = parser.getAttributeName(i);
                        String attrValue = parser.getAttributeValue(i);

                        // Create attribute node and wire it correctly
                        Attr attr = document.createAttribute(attrName);
                        attr.setValue(attrValue);
                        element.setAttributeNode(attr);
                    }

                    // Recursive descent
                    token = parser.nextToken();
                    parse(parser, document, element, XmlPullParser.END_TAG);

                    // Expect the element's end tag here
                    parser.require(XmlPullParser.END_TAG, "", name);
                }
            }

            token = parser.nextToken();
        }
    }

    /**
     * @param token the XML pull parser token type, such as XmlPullParser.CDSECT
     *      or XmlPullParser.ENTITY_REF.
     */
    private void appendText(DocumentImpl document, Node parent, int token, String text) {
        // Ignore empty runs.
        if (text.isEmpty()) {
            return;
        }
        // Merge with any previous text node if possible.
        if (coalescing || token != XmlPullParser.CDSECT) {
            Node lastChild = parent.getLastChild();
            if (lastChild != null && lastChild.getNodeType() == Node.TEXT_NODE) {
                Text textNode = (Text) lastChild;
                textNode.appendData(text);
                return;
            }
        }
        // Okay, we really do need a new text node
        parent.appendChild(token == XmlPullParser.CDSECT
                ? new CDATASectionImpl(document, text)
                : new TextImpl(document, text));
    }

    @Override
    public void setEntityResolver(EntityResolver resolver) {
        entityResolver = resolver;
    }

    @Override
    public void setErrorHandler(ErrorHandler handler) {
        errorHandler = handler;
    }

    /**
     * Controls whether this DocumentBuilder ignores comments.
     */
    public void setIgnoreComments(boolean value) {
        ignoreComments = value;
    }

    public void setCoalescing(boolean value) {
        coalescing = value;
    }

    /**
     * Controls whether this DocumentBuilder ignores element content whitespace.
     */
    public void setIgnoreElementContentWhitespace(boolean value) {
        ignoreElementContentWhitespace = value;
    }

    /**
     * Controls whether this DocumentBuilder is namespace-aware.
     */
    public void setNamespaceAware(boolean value) {
        namespaceAware = value;
    }

    /**
     * Returns the replacement text or null if {@code entity} isn't predefined.
     */
    private String resolvePredefinedOrCharacterEntity(String entityName) {
        // Character references, section 4.1 of the XML specification.
        if (entityName.startsWith("#x")) {
            return resolveCharacterReference(entityName.substring(2), 16);
        } else if (entityName.startsWith("#")) {
            return resolveCharacterReference(entityName.substring(1), 10);
        }
        // Predefined entities, section 4.6 of the XML specification.
        if ("lt".equals(entityName)) {
            return "<";
        } else if ("gt".equals(entityName)) {
            return ">";
        } else if ("amp".equals(entityName)) {
            return "&";
        } else if ("apos".equals(entityName)) {
            return "'";
        } else if ("quot".equals(entityName)) {
            return "\"";
        } else {
            return null;
        }
    }

    private String resolveCharacterReference(String value, int base) {
        try {
            int codePoint = Integer.parseInt(value, base);
            if (Character.isBmpCodePoint(codePoint)) {
                return String.valueOf((char) codePoint);
            } else {
                char[] surrogatePair = Character.toChars(codePoint);
                return new String(surrogatePair);
            }
        } catch (NumberFormatException ex) {
            return null;
        }
    }
}