xml/serializer/EncodingInfo.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the  "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $
 */
package org.apache.xml.serializer;


/**
 * Holds information about a given encoding, which is the Java name for the
 * encoding, the equivalent ISO name.
 * <p>
 * An object of this type has two useful methods
 * <pre>
 * isInEncoding(char ch);
 * </pre>
 * which can be called if the character is not the high one in
 * a surrogate pair and:
 * <pre>
 * isInEncoding(char high, char low);
 * </pre>
 * which can be called if the two characters from a high/low surrogate pair.
 * <p>
 * An EncodingInfo object is a node in a binary search tree. Such a node
 * will answer if a character is in the encoding, and do so for a given
 * range of unicode values (<code>m_first</code> to
 * <code>m_last</code>). It will handle a certain range of values
 * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
 * If the unicode point is before that explicit range, that is it
 * is in the range <code>m_first <= value < m_explFirst</code>, then it will delegate to another EncodingInfo object for The root
 * of such a tree, m_before.  Likewise for values in the range
 * <code>m_explLast < value <= m_last</code>, but delgating to <code>m_after</code>
 * <p>
 * Actually figuring out if a code point is in the encoding is expensive. So the
 * purpose of this tree is to cache such determinations, and not to build the
 * entire tree of information at the start, but only build up as much of the
 * tree as is used during the transformation.
 * <p>
 * This Class is not a public API, and should only be used internally within
 * the serializer.
 * <p>
 * This class is not a public API.
 * @xsl.usage internal
 */
public final class EncodingInfo extends Object
{

    /**
     * Not all characters in an encoding are in on contiguous group,
     * however there is a lowest contiguous group starting at '\u0001'
     * and working up to m_highCharInContiguousGroup.
     * <p>
     * This is the char for which chars at or below this value are
     * definately in the encoding, although for chars
     * above this point they might be in the encoding.
     * This exists for performance, especially for ASCII characters
     * because for ASCII all chars in the range '\u0001' to '\u007F'
     * are in the encoding.
     *
     */
    private final char m_highCharInContiguousGroup;

    /**
     * The ISO encoding name.
     */
    final String name;

    /**
     * The name used by the Java convertor.
     */
    final String javaName;

    /**
     * A helper object that we can ask if a
     * single char, or a surrogate UTF-16 pair
     * of chars that form a single character,
     * is in this encoding.
     */
    private InEncoding m_encoding;

    /**
     * This is not a public API. It returns true if the
     * char in question is in the encoding.
     * @param ch the char in question.
     * <p>
     * This method is not a public API.
     * @xsl.usage internal
     */
    public boolean isInEncoding(char ch) {
        if (m_encoding == null) {
            m_encoding = new EncodingImpl();

            // One could put alternate logic in here to
            // instantiate another object that implements the
            // InEncoding interface. For example if the JRE is 1.4 or up
            // we could have an object that uses JRE 1.4 methods
        }
        return m_encoding.isInEncoding(ch);
    }

    /**
     * This is not a public API. It returns true if the
     * character formed by the high/low pair is in the encoding.
     * @param high a char that the a high char of a high/low surrogate pair.
     * @param low a char that is the low char of a high/low surrogate pair.
     * <p>
     * This method is not a public API.
     * @xsl.usage internal
     */
    public boolean isInEncoding(char high, char low) {
        if (m_encoding == null) {
            m_encoding = new EncodingImpl();

            // One could put alternate logic in here to
            // instantiate another object that implements the
            // InEncoding interface. For example if the JRE is 1.4 or up
            // we could have an object that uses JRE 1.4 methods
        }
        return m_encoding.isInEncoding(high, low);
    }

    /**
     * Create an EncodingInfo object based on the ISO name and Java name.
     * If both parameters are null any character will be considered to
     * be in the encoding. This is useful for when the serializer is in
     * temporary output state, and has no assciated encoding.
     *
     * @param name reference to the ISO name.
     * @param javaName reference to the Java encoding name.
     * @param highChar The char for which characters at or below this value are
     * definately in the
     * encoding, although for characters above this point they might be in the encoding.
     */
    public EncodingInfo(String name, String javaName, char highChar)
    {

        this.name = name;
        this.javaName = javaName;
        this.m_highCharInContiguousGroup = highChar;
    }


    /**
     * A simple interface to isolate the implementation.
     * We could also use some new JRE 1.4 methods in another implementation
     * provided we use reflection with them.
     * <p>
     * This interface is not a public API,
     * and should only be used internally within the serializer.
     * @xsl.usage internal
     */
    private interface InEncoding {
        /**
         * Returns true if the char is in the encoding
         */
        public boolean isInEncoding(char ch);
        /**
         * Returns true if the high/low surrogate pair forms
         * a character that is in the encoding.
         */
        public boolean isInEncoding(char high, char low);
    }

    /**
     * This class implements the
     */
    private class EncodingImpl implements InEncoding {


        public boolean isInEncoding(char ch1) {
            final boolean ret;
            int codePoint = Encodings.toCodePoint(ch1);
            if (codePoint < m_explFirst) {
                // The unicode value is before the range
                // that we explictly manage, so we delegate the answer.

                // If we don't have an m_before object to delegate to, make one.
                if (m_before == null)
                    m_before =
                        new EncodingImpl(
                            m_encoding,
                            m_first,
                            m_explFirst - 1,
                            codePoint);
                ret = m_before.isInEncoding(ch1);
            } else if (m_explLast < codePoint) {
                // The unicode value is after the range
                // that we explictly manage, so we delegate the answer.

                // If we don't have an m_after object to delegate to, make one.
                if (m_after == null)
                    m_after =
                        new EncodingImpl(
                            m_encoding,
                            m_explLast + 1,
                            m_last,
                            codePoint);
                ret = m_after.isInEncoding(ch1);
            } else {
                // The unicode value is in the range we explitly handle
                final int idx = codePoint - m_explFirst;

                // If we already know the answer, just return it.
                if (m_alreadyKnown[idx])
                    ret = m_isInEncoding[idx];
                else {
                    // We don't know the answer, so find out,
                    // which may be expensive, then cache the answer
                    ret = inEncoding(ch1, m_encoding);
                    m_alreadyKnown[idx] = true;
                    m_isInEncoding[idx] = ret;
                }
            }
            return ret;
        }

        public boolean isInEncoding(char high, char low) {
            final boolean ret;
            int codePoint = Encodings.toCodePoint(high,low);
            if (codePoint < m_explFirst) {
                // The unicode value is before the range
                // that we explictly manage, so we delegate the answer.

                // If we don't have an m_before object to delegate to, make one.
                if (m_before == null)
                    m_before =
                        new EncodingImpl(
                            m_encoding,
                            m_first,
                            m_explFirst - 1,
                            codePoint);
                ret = m_before.isInEncoding(high,low);
            } else if (m_explLast < codePoint) {
                // The unicode value is after the range
                // that we explictly manage, so we delegate the answer.

                // If we don't have an m_after object to delegate to, make one.
                if (m_after == null)
                    m_after =
                        new EncodingImpl(
                            m_encoding,
                            m_explLast + 1,
                            m_last,
                            codePoint);
                ret = m_after.isInEncoding(high,low);
            } else {
                // The unicode value is in the range we explitly handle
                final int idx = codePoint - m_explFirst;

                // If we already know the answer, just return it.
                if (m_alreadyKnown[idx])
                    ret = m_isInEncoding[idx];
                else {
                    // We don't know the answer, so find out,
                    // which may be expensive, then cache the answer
                    ret = inEncoding(high, low, m_encoding);
                    m_alreadyKnown[idx] = true;
                    m_isInEncoding[idx] = ret;
                }
            }
            return ret;
        }

        /**
         * The encoding.
         */
        final private String m_encoding;
        /**
         * m_first through m_last is the range of unicode
         * values that this object will return an answer on.
         * It may delegate to a similar object with a different
         * range
         */
        final private int m_first;

        /**
         * m_explFirst through m_explLast is the range of unicode
         * value that this object handles explicitly and does not
         * delegate to a similar object.
         */
        final private int m_explFirst;
        final private int m_explLast;
        final private int m_last;

        /**
         * The object, of the same type as this one,
         * that handles unicode values in a range before
         * the range explictly handled by this object, and
         * to which this object may delegate.
         */
        private InEncoding m_before;
        /**
         * The object, of the same type as this one,
         * that handles unicode values in a range after
         * the range explictly handled by this object, and
         * to which this object may delegate.
         */
        private InEncoding m_after;

        /**
         * The number of unicode values explicitly handled
         * by a single EncodingInfo object. This value is
         * tuneable, but is set to 128 because that covers the
         * entire low range of ASCII type chars within a single
         * object.
         */
        private static final int RANGE = 128;

        /**
         * A flag to record if we already know the answer
         * for the given unicode value.
         */
        final private boolean m_alreadyKnown[] = new boolean[RANGE];
        /**
         * A table holding the answer on whether the given unicode
         * value is in the encoding.
         */
        final private boolean m_isInEncoding[] = new boolean[RANGE];

        private EncodingImpl() {
            // This object will answer whether any unicode value
            // is in the encoding, it handles values 0 through Integer.MAX_VALUE
            this(javaName, 0, Integer.MAX_VALUE, (char) 0);
        }

        private EncodingImpl(String encoding, int first, int last, int codePoint) {
            // Set the range of unicode values that this object manages
            // either explicitly or implicitly.
            m_first = first;
            m_last = last;

            // Set the range of unicode values that this object
            // explicitly manages
            m_explFirst = codePoint;
            m_explLast = codePoint + (RANGE-1);

            m_encoding = encoding;

            if (javaName != null)
            {
                // Some optimization.
                if (0 <= m_explFirst && m_explFirst <= 127) {
                    // This particular EncodingImpl explicitly handles
                    // characters in the low range.
                    if ("UTF8".equals(javaName)
                        || "UTF-16".equals(javaName)
                        || "ASCII".equals(javaName)
                        || "US-ASCII".equals(javaName)
                        || "Unicode".equals(javaName)
                        || "UNICODE".equals(javaName)
                        || javaName.startsWith("ISO8859")) {

                        // Not only does this EncodingImpl object explicitly
                        // handle chracters in the low range, it is
                        // also one that we know something about, without
                        // needing to call inEncoding(char ch, String encoding)
                        // for this low range
                        //
                        // By initializing the table ahead of time
                        // for these low values, we prevent the expensive
                        // inEncoding(char ch, String encoding)
                        // from being called, at least for these common
                        // encodings.
                        for (int unicode = 1; unicode < 127; unicode++) {
                            final int idx = unicode - m_explFirst;
                            if (0 <= idx && idx < RANGE) {
                                m_alreadyKnown[idx] = true;
                                m_isInEncoding[idx] = true;
                            }
                        }
                    }
                }

                /* A little bit more than optimization.
                 *
                 * We will say that any character is in the encoding if
                 * we don't have an encoding.
                 * This is meaningful when the serializer is being used
                 * in temporary output state, where we are not writing to
                 * the final output tree.  It is when writing to the
                 * final output tree that we need to worry about the output
                 * encoding
                 */
                if (javaName == null) {
                    for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
                        m_alreadyKnown[idx] = true;
                        m_isInEncoding[idx] = true;
                    }
                }
            }
        }
    }

    /**
     * This is heart of the code that determines if a given character
     * is in the given encoding. This method is probably expensive,
     * and the answer should be cached.
     * <p>
     * This method is not a public API,
     * and should only be used internally within the serializer.
     * @param ch the char in question, that is not a high char of
     * a high/low surrogate pair.
     * @param encoding the Java name of the enocding.
     *
     * @xsl.usage internal
     *
     */
    private static boolean inEncoding(char ch, String encoding) {
        boolean isInEncoding;
        try {
            char cArray[] = new char[1];
            cArray[0] = ch;
            // Construct a String from the char
            String s = new String(cArray);
            // Encode the String into a sequence of bytes
            // using the given, named charset.
            byte[] bArray = s.getBytes(encoding);
            isInEncoding = inEncoding(ch, bArray);

        } catch (Exception e) {
            isInEncoding = false;

            // If for some reason the encoding is null, e.g.
            // for a temporary result tree, we should just
            // say that every character is in the encoding.
            if (encoding == null)
            	isInEncoding = true;
        }
        return isInEncoding;
    }

    /**
     * This is heart of the code that determines if a given high/low
     * surrogate pair forms a character that is in the given encoding.
     * This method is probably expensive, and the answer should be cached.
     * <p>
     * This method is not a public API,
     * and should only be used internally within the serializer.
     * @param high the high char of
     * a high/low surrogate pair.
     * @param low the low char of a high/low surrogate pair.
     * @param encoding the Java name of the encoding.
     *
     * @xsl.usage internal
     *
     */
    private static boolean inEncoding(char high, char low, String encoding) {
        boolean isInEncoding;
        try {
            char cArray[] = new char[2];
            cArray[0] = high;
            cArray[1] = low;
            // Construct a String from the char
            String s = new String(cArray);
            // Encode the String into a sequence of bytes
            // using the given, named charset.
            byte[] bArray = s.getBytes(encoding);
            isInEncoding = inEncoding(high,bArray);
        } catch (Exception e) {
            isInEncoding = false;
        }

        return isInEncoding;
    }

    /**
     * This method is the core of determining if character
     * is in the encoding. The method is not foolproof, because
     * s.getBytes(encoding) has specified behavior only if the
     * characters are in the specified encoding. However this
     * method tries it's best.
     * @param ch the char that was converted using getBytes, or
     * the first char of a high/low pair that was converted.
     * @param data the bytes written out by the call to s.getBytes(encoding);
     * @return true if the character is in the encoding.
     */
    private static boolean inEncoding(char ch, byte[] data) {
        final boolean isInEncoding;
        // If the string written out as data is not in the encoding,
        // the output is not specified according to the documentation
        // on the String.getBytes(encoding) method,
        // but we do our best here.
        if (data==null || data.length == 0) {
            isInEncoding = false;
        }
        else {
            if (data[0] == 0)
                isInEncoding = false;
            else if (data[0] == '?' && ch != '?')
                isInEncoding = false;
            /*
             * else if (isJapanese) {
             *   // isJapanese is really
             *   //   (    "EUC-JP".equals(javaName)
             *   //    ||  "EUC_JP".equals(javaName)
             *  //     ||  "SJIS".equals(javaName)   )
             *
             *   // Work around some bugs in JRE for Japanese
             *   if(data[0] == 0x21)
             *     isInEncoding = false;
             *   else if (ch == 0xA5)
             *     isInEncoding = false;
             *   else
             *     isInEncoding = true;
             * }
             */

            else {
                // We don't know for sure, but it looks like it is in the encoding
                isInEncoding = true;
            }
        }
        return isInEncoding;
    }

    /**
     * This method exists for performance reasons.
     * <p>
     * Except for '\u0000', if a char is less than or equal to the value
     * returned by this method then it in the encoding.
     * <p>
     * The characters in an encoding are not contiguous, however
     * there is a lowest group of chars starting at '\u0001' upto and
     * including the char returned by this method that are all in the encoding.
     * So the char returned by this method essentially defines the lowest
     * contiguous group.
     * <p>
     * chars above the value returned might be in the encoding, but
     * chars at or below the value returned are definately in the encoding.
     * <p>
     * In any case however, the isInEncoding(char) method can be used
     * regardless of the value of the char returned by this method.
     * <p>
     * If the value returned is '\u0000' it means that every character must be tested
     * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)}
     * for surrogate pairs.
     * <p>
     * This method is not a public API.
     * @xsl.usage internal
     */
    public final char getHighChar() {
        return m_highCharInContiguousGroup;
    }

}